1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines: memory allocators, socket lock/release
8  *		handlers for protocols to use and a generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
147 static void sock_inuse_add(struct net *net, int val);
148 
149 /**
150  * sk_ns_capable - General socket capability test
151  * @sk: Socket to use a capability on or through
152  * @user_ns: The user namespace of the capability to use
153  * @cap: The capability to use
154  *
155  * Test to see if the opener of the socket had the capability @cap when
156  * the socket was created and that the current process has it in the user
157  * namespace @user_ns.
158  */
159 bool sk_ns_capable(const struct sock *sk,
160 		   struct user_namespace *user_ns, int cap)
161 {
162 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
163 		ns_capable(user_ns, cap);
164 }
165 EXPORT_SYMBOL(sk_ns_capable);
166 
167 /**
168  * sk_capable - Socket global capability test
169  * @sk: Socket to use a capability on or through
170  * @cap: The global capability to use
171  *
172  * Test to see if the opener of the socket had the capability @cap when
173  * the socket was created and that the current process has it in all user
174  * namespaces.
175  */
176 bool sk_capable(const struct sock *sk, int cap)
177 {
178 	return sk_ns_capable(sk, &init_user_ns, cap);
179 }
180 EXPORT_SYMBOL(sk_capable);
181 
182 /**
183  * sk_net_capable - Network namespace socket capability test
184  * @sk: Socket to use a capability on or through
185  * @cap: The capability to use
186  *
187  * Test to see if the opener of the socket had the capability @cap when the
188  * socket was created and that the current process has it over the network
189  * namespace the socket is a member of.
190  */
191 bool sk_net_capable(const struct sock *sk, int cap)
192 {
193 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
194 }
195 EXPORT_SYMBOL(sk_net_capable);
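
/*
 * Usage sketch (illustrative only; my_proto_set_priv_opt() is a hypothetical
 * helper, not a kernel symbol): a protocol's setsockopt path would typically
 * gate a privileged option on the socket's own network namespace like this,
 * so that both the socket opener's credentials and the calling task are
 * taken into account:
 *
 *	static int my_proto_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 */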
196 
197 /*
198  * Each address family might have different locking rules, so we have
199  * one slock key per address family and separate keys for internal and
200  * userspace sockets.
201  */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_keys[AF_MAX];
204 static struct lock_class_key af_family_slock_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
206 
207 /*
208  * Make lock validator output more readable. (we pre-construct these
209  * strings build-time, so that runtime initialization of socket
210  * locks is fast):
211  */
212 
213 #define _sock_locks(x)						  \
214   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
215   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
216   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
217   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
218   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
219   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
220   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
221   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
222   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
223   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
224   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
225   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
226   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
227   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
228   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
229   x "AF_MCTP"  , \
230   x "AF_MAX"
231 
232 static const char *const af_family_key_strings[AF_MAX+1] = {
233 	_sock_locks("sk_lock-")
234 };
235 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
236 	_sock_locks("slock-")
237 };
238 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
239 	_sock_locks("clock-")
240 };
241 
242 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-sk_lock-")
244 };
245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-slock-")
247 };
248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("k-clock-")
250 };
251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("rlock-")
253 };
254 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
255 	_sock_locks("wlock-")
256 };
257 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
258 	_sock_locks("elock-")
259 };
260 
261 /*
262  * sk_callback_lock and sk queues locking rules are per-address-family,
263  * so split the lock classes by using a per-AF key:
264  */
265 static struct lock_class_key af_callback_keys[AF_MAX];
266 static struct lock_class_key af_rlock_keys[AF_MAX];
267 static struct lock_class_key af_wlock_keys[AF_MAX];
268 static struct lock_class_key af_elock_keys[AF_MAX];
269 static struct lock_class_key af_kern_callback_keys[AF_MAX];
270 
271 /* Run time adjustable parameters. */
272 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
273 EXPORT_SYMBOL(sysctl_wmem_max);
274 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
275 EXPORT_SYMBOL(sysctl_rmem_max);
276 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
277 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
278 
279 /* Maximal space eaten by iovec or ancillary data plus some space */
280 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
281 EXPORT_SYMBOL(sysctl_optmem_max);
282 
283 int sysctl_tstamp_allow_data __read_mostly = 1;
284 
285 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
286 EXPORT_SYMBOL_GPL(memalloc_socks_key);
287 
288 /**
289  * sk_set_memalloc - sets %SOCK_MEMALLOC
290  * @sk: socket to set it on
291  *
292  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
293  * It's the responsibility of the admin to adjust min_free_kbytes
294  * to meet the requirements.
295  */
296 void sk_set_memalloc(struct sock *sk)
297 {
298 	sock_set_flag(sk, SOCK_MEMALLOC);
299 	sk->sk_allocation |= __GFP_MEMALLOC;
300 	static_branch_inc(&memalloc_socks_key);
301 }
302 EXPORT_SYMBOL_GPL(sk_set_memalloc);
303 
304 void sk_clear_memalloc(struct sock *sk)
305 {
306 	sock_reset_flag(sk, SOCK_MEMALLOC);
307 	sk->sk_allocation &= ~__GFP_MEMALLOC;
308 	static_branch_dec(&memalloc_socks_key);
309 
310 	/*
311 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
312 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
313 	 * it has rmem allocations due to the last swapfile being deactivated
314 	 * but there is a risk that the socket is unusable due to exceeding
315 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
316 	 */
317 	sk_mem_reclaim(sk);
318 }
319 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
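
/*
 * Usage sketch (illustrative only): a block/network driver carrying swap
 * traffic over a kernel socket (swap over NBD/NFS style setups) would flag
 * the socket before writeback starts and clear the flag when the last
 * swapfile behind it goes away.  Error handling trimmed for brevity:
 *
 *	struct socket *sock;
 *
 *	if (sock_create_kern(&init_net, AF_INET, SOCK_STREAM, IPPROTO_TCP,
 *			     &sock))
 *		return -EIO;
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 *	sock_release(sock);
 */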
320 
321 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
322 {
323 	int ret;
324 	unsigned int noreclaim_flag;
325 
326 	/* these should have been dropped before queueing */
327 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
328 
329 	noreclaim_flag = memalloc_noreclaim_save();
330 	ret = sk->sk_backlog_rcv(sk, skb);
331 	memalloc_noreclaim_restore(noreclaim_flag);
332 
333 	return ret;
334 }
335 EXPORT_SYMBOL(__sk_backlog_rcv);
336 
337 void sk_error_report(struct sock *sk)
338 {
339 	sk->sk_error_report(sk);
340 
341 	switch (sk->sk_family) {
342 	case AF_INET:
343 		fallthrough;
344 	case AF_INET6:
345 		trace_inet_sk_error_report(sk);
346 		break;
347 	default:
348 		break;
349 	}
350 }
351 EXPORT_SYMBOL(sk_error_report);
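
/*
 * Usage sketch (illustrative only): protocols report asynchronous errors by
 * setting sk_err and then calling sk_error_report(), which invokes the
 * socket's sk_error_report callback (waking pollers by default) and, for
 * inet sockets, fires the tracepoint above:
 *
 *	sk->sk_err = ECONNREFUSED;
 *	sk_error_report(sk);
 */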
352 
353 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
354 {
355 	struct __kernel_sock_timeval tv;
356 
357 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
358 		tv.tv_sec = 0;
359 		tv.tv_usec = 0;
360 	} else {
361 		tv.tv_sec = timeo / HZ;
362 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
363 	}
364 
365 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
366 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
367 		*(struct old_timeval32 *)optval = tv32;
368 		return sizeof(tv32);
369 	}
370 
371 	if (old_timeval) {
372 		struct __kernel_old_timeval old_tv;
373 		old_tv.tv_sec = tv.tv_sec;
374 		old_tv.tv_usec = tv.tv_usec;
375 		*(struct __kernel_old_timeval *)optval = old_tv;
376 		return sizeof(old_tv);
377 	}
378 
379 	*(struct __kernel_sock_timeval *)optval = tv;
380 	return sizeof(tv);
381 }
382 
383 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
384 			    bool old_timeval)
385 {
386 	struct __kernel_sock_timeval tv;
387 
388 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 		struct old_timeval32 tv32;
390 
391 		if (optlen < sizeof(tv32))
392 			return -EINVAL;
393 
394 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 			return -EFAULT;
396 		tv.tv_sec = tv32.tv_sec;
397 		tv.tv_usec = tv32.tv_usec;
398 	} else if (old_timeval) {
399 		struct __kernel_old_timeval old_tv;
400 
401 		if (optlen < sizeof(old_tv))
402 			return -EINVAL;
403 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 			return -EFAULT;
405 		tv.tv_sec = old_tv.tv_sec;
406 		tv.tv_usec = old_tv.tv_usec;
407 	} else {
408 		if (optlen < sizeof(tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
411 			return -EFAULT;
412 	}
413 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
414 		return -EDOM;
415 
416 	if (tv.tv_sec < 0) {
417 		static int warned __read_mostly;
418 
419 		*timeo_p = 0;
420 		if (warned < 10 && net_ratelimit()) {
421 			warned++;
422 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
423 				__func__, current->comm, task_pid_nr(current));
424 		}
425 		return 0;
426 	}
427 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
428 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
429 		return 0;
430 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
431 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
432 	return 0;
433 }
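
/*
 * Worked example of the conversion above (assuming HZ == 1000): a timeout of
 * { .tv_sec = 1, .tv_usec = 500000 } becomes
 * 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 1000 + 500 = 1500
 * jiffies, while { 0, 0 } is left as MAX_SCHEDULE_TIMEOUT, i.e. "block
 * forever", matching the SO_RCVTIMEO/SO_SNDTIMEO semantics.
 */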
434 
435 static bool sock_needs_netstamp(const struct sock *sk)
436 {
437 	switch (sk->sk_family) {
438 	case AF_UNSPEC:
439 	case AF_UNIX:
440 		return false;
441 	default:
442 		return true;
443 	}
444 }
445 
446 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
447 {
448 	if (sk->sk_flags & flags) {
449 		sk->sk_flags &= ~flags;
450 		if (sock_needs_netstamp(sk) &&
451 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
452 			net_disable_timestamp();
453 	}
454 }
455 
456 
457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
458 {
459 	unsigned long flags;
460 	struct sk_buff_head *list = &sk->sk_receive_queue;
461 
462 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
463 		atomic_inc(&sk->sk_drops);
464 		trace_sock_rcvqueue_full(sk, skb);
465 		return -ENOMEM;
466 	}
467 
468 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
469 		atomic_inc(&sk->sk_drops);
470 		return -ENOBUFS;
471 	}
472 
473 	skb->dev = NULL;
474 	skb_set_owner_r(skb, sk);
475 
476 	/* We escape from the RCU-protected region here, so make sure we
477 	 * don't leak a non-refcounted dst.
478 	 */
479 	skb_dst_force(skb);
480 
481 	spin_lock_irqsave(&list->lock, flags);
482 	sock_skb_set_dropcount(sk, skb);
483 	__skb_queue_tail(list, skb);
484 	spin_unlock_irqrestore(&list->lock, flags);
485 
486 	if (!sock_flag(sk, SOCK_DEAD))
487 		sk->sk_data_ready(sk);
488 	return 0;
489 }
490 EXPORT_SYMBOL(__sock_queue_rcv_skb);
491 
492 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
493 {
494 	int err;
495 
496 	err = sk_filter(sk, skb);
497 	if (err)
498 		return err;
499 
500 	return __sock_queue_rcv_skb(sk, skb);
501 }
502 EXPORT_SYMBOL(sock_queue_rcv_skb);
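
/*
 * Usage sketch (illustrative only; my_proto_lookup() is hypothetical): a
 * datagram-style protocol's input path hands skbs to the owning socket with
 * sock_queue_rcv_skb() and drops them itself when queueing fails:
 *
 *	static int my_proto_rcv(struct sk_buff *skb)
 *	{
 *		struct sock *sk = my_proto_lookup(skb);
 *
 *		if (!sk || sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */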
503 
504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
505 		     const int nested, unsigned int trim_cap, bool refcounted)
506 {
507 	int rc = NET_RX_SUCCESS;
508 
509 	if (sk_filter_trim_cap(sk, skb, trim_cap))
510 		goto discard_and_relse;
511 
512 	skb->dev = NULL;
513 
514 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
515 		atomic_inc(&sk->sk_drops);
516 		goto discard_and_relse;
517 	}
518 	if (nested)
519 		bh_lock_sock_nested(sk);
520 	else
521 		bh_lock_sock(sk);
522 	if (!sock_owned_by_user(sk)) {
523 		/*
524 		 * trylock + unlock semantics:
525 		 */
526 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
527 
528 		rc = sk_backlog_rcv(sk, skb);
529 
530 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
531 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
532 		bh_unlock_sock(sk);
533 		atomic_inc(&sk->sk_drops);
534 		goto discard_and_relse;
535 	}
536 
537 	bh_unlock_sock(sk);
538 out:
539 	if (refcounted)
540 		sock_put(sk);
541 	return rc;
542 discard_and_relse:
543 	kfree_skb(skb);
544 	goto out;
545 }
546 EXPORT_SYMBOL(__sk_receive_skb);
547 
548 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
549 							  u32));
550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
551 							   u32));
552 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
553 {
554 	struct dst_entry *dst = __sk_dst_get(sk);
555 
556 	if (dst && dst->obsolete &&
557 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
558 			       dst, cookie) == NULL) {
559 		sk_tx_queue_clear(sk);
560 		sk->sk_dst_pending_confirm = 0;
561 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
562 		dst_release(dst);
563 		return NULL;
564 	}
565 
566 	return dst;
567 }
568 EXPORT_SYMBOL(__sk_dst_check);
569 
570 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
571 {
572 	struct dst_entry *dst = sk_dst_get(sk);
573 
574 	if (dst && dst->obsolete &&
575 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
576 			       dst, cookie) == NULL) {
577 		sk_dst_reset(sk);
578 		dst_release(dst);
579 		return NULL;
580 	}
581 
582 	return dst;
583 }
584 EXPORT_SYMBOL(sk_dst_check);
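
/*
 * Usage sketch (illustrative only; my_route_output() is hypothetical): a
 * connected socket's transmit path revalidates its cached route and
 * re-resolves when the cached entry has gone stale.  IPv4 callers pass a
 * cookie of 0; IPv6 callers pass their inet6 dst cookie.
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst) {
 *		dst = my_route_output(sk);
 *		if (IS_ERR(dst))
 *			return PTR_ERR(dst);
 *		sk_dst_set(sk, dst);
 *	}
 */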
585 
586 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
587 {
588 	int ret = -ENOPROTOOPT;
589 #ifdef CONFIG_NETDEVICES
590 	struct net *net = sock_net(sk);
591 
592 	/* Sorry... */
593 	ret = -EPERM;
594 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
595 		goto out;
596 
597 	ret = -EINVAL;
598 	if (ifindex < 0)
599 		goto out;
600 
601 	sk->sk_bound_dev_if = ifindex;
602 	if (sk->sk_prot->rehash)
603 		sk->sk_prot->rehash(sk);
604 	sk_dst_reset(sk);
605 
606 	ret = 0;
607 
608 out:
609 #endif
610 
611 	return ret;
612 }
613 
614 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
615 {
616 	int ret;
617 
618 	if (lock_sk)
619 		lock_sock(sk);
620 	ret = sock_bindtoindex_locked(sk, ifindex);
621 	if (lock_sk)
622 		release_sock(sk);
623 
624 	return ret;
625 }
626 EXPORT_SYMBOL(sock_bindtoindex);
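
/*
 * Usage sketch (illustrative only): in-kernel users (tunnels, storage
 * transports, ...) pin a kernel socket to an interface by index rather than
 * by name; lock_sk tells the helper whether it still needs to take the
 * socket lock itself:
 *
 *	err = sock_bindtoindex(sock->sk, dev->ifindex, true);
 *	if (err)
 *		goto out_release;
 */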
627 
628 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
629 {
630 	int ret = -ENOPROTOOPT;
631 #ifdef CONFIG_NETDEVICES
632 	struct net *net = sock_net(sk);
633 	char devname[IFNAMSIZ];
634 	int index;
635 
636 	ret = -EINVAL;
637 	if (optlen < 0)
638 		goto out;
639 
640 	/* Bind this socket to a particular device like "eth0",
641 	 * as specified in the passed interface name. If the
642 	 * name is "" or the option length is zero the socket
643 	 * is not bound.
644 	 */
645 	if (optlen > IFNAMSIZ - 1)
646 		optlen = IFNAMSIZ - 1;
647 	memset(devname, 0, sizeof(devname));
648 
649 	ret = -EFAULT;
650 	if (copy_from_sockptr(devname, optval, optlen))
651 		goto out;
652 
653 	index = 0;
654 	if (devname[0] != '\0') {
655 		struct net_device *dev;
656 
657 		rcu_read_lock();
658 		dev = dev_get_by_name_rcu(net, devname);
659 		if (dev)
660 			index = dev->ifindex;
661 		rcu_read_unlock();
662 		ret = -ENODEV;
663 		if (!dev)
664 			goto out;
665 	}
666 
667 	return sock_bindtoindex(sk, index, true);
668 out:
669 #endif
670 
671 	return ret;
672 }
673 
674 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
675 				int __user *optlen, int len)
676 {
677 	int ret = -ENOPROTOOPT;
678 #ifdef CONFIG_NETDEVICES
679 	struct net *net = sock_net(sk);
680 	char devname[IFNAMSIZ];
681 
682 	if (sk->sk_bound_dev_if == 0) {
683 		len = 0;
684 		goto zero;
685 	}
686 
687 	ret = -EINVAL;
688 	if (len < IFNAMSIZ)
689 		goto out;
690 
691 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
692 	if (ret)
693 		goto out;
694 
695 	len = strlen(devname) + 1;
696 
697 	ret = -EFAULT;
698 	if (copy_to_user(optval, devname, len))
699 		goto out;
700 
701 zero:
702 	ret = -EFAULT;
703 	if (put_user(len, optlen))
704 		goto out;
705 
706 	ret = 0;
707 
708 out:
709 #endif
710 
711 	return ret;
712 }
713 
714 bool sk_mc_loop(struct sock *sk)
715 {
716 	if (dev_recursion_level())
717 		return false;
718 	if (!sk)
719 		return true;
720 	switch (sk->sk_family) {
721 	case AF_INET:
722 		return inet_sk(sk)->mc_loop;
723 #if IS_ENABLED(CONFIG_IPV6)
724 	case AF_INET6:
725 		return inet6_sk(sk)->mc_loop;
726 #endif
727 	}
728 	WARN_ON_ONCE(1);
729 	return true;
730 }
731 EXPORT_SYMBOL(sk_mc_loop);
732 
733 void sock_set_reuseaddr(struct sock *sk)
734 {
735 	lock_sock(sk);
736 	sk->sk_reuse = SK_CAN_REUSE;
737 	release_sock(sk);
738 }
739 EXPORT_SYMBOL(sock_set_reuseaddr);
740 
741 void sock_set_reuseport(struct sock *sk)
742 {
743 	lock_sock(sk);
744 	sk->sk_reuseport = true;
745 	release_sock(sk);
746 }
747 EXPORT_SYMBOL(sock_set_reuseport);
748 
749 void sock_no_linger(struct sock *sk)
750 {
751 	lock_sock(sk);
752 	sk->sk_lingertime = 0;
753 	sock_set_flag(sk, SOCK_LINGER);
754 	release_sock(sk);
755 }
756 EXPORT_SYMBOL(sock_no_linger);
757 
758 void sock_set_priority(struct sock *sk, u32 priority)
759 {
760 	lock_sock(sk);
761 	sk->sk_priority = priority;
762 	release_sock(sk);
763 }
764 EXPORT_SYMBOL(sock_set_priority);
765 
766 void sock_set_sndtimeo(struct sock *sk, s64 secs)
767 {
768 	lock_sock(sk);
769 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
770 		sk->sk_sndtimeo = secs * HZ;
771 	else
772 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
773 	release_sock(sk);
774 }
775 EXPORT_SYMBOL(sock_set_sndtimeo);
776 
777 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
778 {
779 	if (val)  {
780 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
781 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
782 		sock_set_flag(sk, SOCK_RCVTSTAMP);
783 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
784 	} else {
785 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
786 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
787 	}
788 }
789 
790 void sock_enable_timestamps(struct sock *sk)
791 {
792 	lock_sock(sk);
793 	__sock_set_timestamps(sk, true, false, true);
794 	release_sock(sk);
795 }
796 EXPORT_SYMBOL(sock_enable_timestamps);
797 
798 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
799 {
800 	switch (optname) {
801 	case SO_TIMESTAMP_OLD:
802 		__sock_set_timestamps(sk, valbool, false, false);
803 		break;
804 	case SO_TIMESTAMP_NEW:
805 		__sock_set_timestamps(sk, valbool, true, false);
806 		break;
807 	case SO_TIMESTAMPNS_OLD:
808 		__sock_set_timestamps(sk, valbool, false, true);
809 		break;
810 	case SO_TIMESTAMPNS_NEW:
811 		__sock_set_timestamps(sk, valbool, true, true);
812 		break;
813 	}
814 }
815 
816 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
817 {
818 	struct net *net = sock_net(sk);
819 	struct net_device *dev = NULL;
820 	bool match = false;
821 	int *vclock_index;
822 	int i, num;
823 
824 	if (sk->sk_bound_dev_if)
825 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
826 
827 	if (!dev) {
828 		pr_err("%s: socket is not bound to a device\n", __func__);
829 		return -EOPNOTSUPP;
830 	}
831 
832 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
833 	for (i = 0; i < num; i++) {
834 		if (*(vclock_index + i) == phc_index) {
835 			match = true;
836 			break;
837 		}
838 	}
839 
840 	if (num > 0)
841 		kfree(vclock_index);
842 
843 	if (!match)
844 		return -EINVAL;
845 
846 	sk->sk_bind_phc = phc_index;
847 
848 	return 0;
849 }
850 
851 int sock_set_timestamping(struct sock *sk, int optname,
852 			  struct so_timestamping timestamping)
853 {
854 	int val = timestamping.flags;
855 	int ret;
856 
857 	if (val & ~SOF_TIMESTAMPING_MASK)
858 		return -EINVAL;
859 
860 	if (val & SOF_TIMESTAMPING_OPT_ID &&
861 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
862 		if (sk->sk_protocol == IPPROTO_TCP &&
863 		    sk->sk_type == SOCK_STREAM) {
864 			if ((1 << sk->sk_state) &
865 			    (TCPF_CLOSE | TCPF_LISTEN))
866 				return -EINVAL;
867 			sk->sk_tskey = tcp_sk(sk)->snd_una;
868 		} else {
869 			sk->sk_tskey = 0;
870 		}
871 	}
872 
873 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
874 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
875 		return -EINVAL;
876 
877 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
878 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
879 		if (ret)
880 			return ret;
881 	}
882 
883 	sk->sk_tsflags = val;
884 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
885 
886 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
887 		sock_enable_timestamp(sk,
888 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
889 	else
890 		sock_disable_timestamp(sk,
891 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
892 	return 0;
893 }
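
/*
 * Userspace sketch (illustrative only; the vclock index 1 is an assumed
 * value that would normally be discovered through the ethtool PHC vclocks
 * query): requesting hardware TX timestamps taken from a bound PHC virtual
 * clock.  The socket must already be bound to the device
 * (SO_BINDTODEVICE/SO_BINDTOIFINDEX), otherwise sock_timestamping_bind_phc()
 * above returns -EOPNOTSUPP.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 */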
894 
895 void sock_set_keepalive(struct sock *sk)
896 {
897 	lock_sock(sk);
898 	if (sk->sk_prot->keepalive)
899 		sk->sk_prot->keepalive(sk, true);
900 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
901 	release_sock(sk);
902 }
903 EXPORT_SYMBOL(sock_set_keepalive);
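
/*
 * Usage sketch (illustrative only): the lock-taking helpers above exist so
 * that in-kernel socket users (NBD, DLM, ceph and friends) can tune a kernel
 * socket without open-coding sock_setsockopt() and its sockptr_t plumbing:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_set_keepalive(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);
 *	sock_no_linger(sock->sk);
 */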
904 
905 static void __sock_set_rcvbuf(struct sock *sk, int val)
906 {
907 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
908 	 * as a negative value.
909 	 */
910 	val = min_t(int, val, INT_MAX / 2);
911 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
912 
913 	/* We double it on the way in to account for "struct sk_buff" etc.
914 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
915 	 * will allow that much actual data to be received on that socket.
916 	 *
917 	 * Applications are unaware that "struct sk_buff" and other overheads
918 	 * allocate from the receive buffer during socket buffer allocation.
919 	 *
920 	 * And after considering the possible alternatives, returning the value
921 	 * we actually used in getsockopt is the most desirable behavior.
922 	 */
923 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
924 }
925 
926 void sock_set_rcvbuf(struct sock *sk, int val)
927 {
928 	lock_sock(sk);
929 	__sock_set_rcvbuf(sk, val);
930 	release_sock(sk);
931 }
932 EXPORT_SYMBOL(sock_set_rcvbuf);
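
/*
 * Worked example of the doubling above: requesting a 64 KiB receive buffer
 * stores 128 KiB in sk_rcvbuf, and that doubled figure is what a later
 * SO_RCVBUF getsockopt() reports; requests smaller than SOCK_MIN_RCVBUF are
 * rounded up to that floor.  Note that only sock_setsockopt() clamps the
 * request to sysctl_rmem_max - callers of sock_set_rcvbuf() are trusted
 * kernel users.
 */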
933 
934 static void __sock_set_mark(struct sock *sk, u32 val)
935 {
936 	if (val != sk->sk_mark) {
937 		sk->sk_mark = val;
938 		sk_dst_reset(sk);
939 	}
940 }
941 
942 void sock_set_mark(struct sock *sk, u32 val)
943 {
944 	lock_sock(sk);
945 	__sock_set_mark(sk, val);
946 	release_sock(sk);
947 }
948 EXPORT_SYMBOL(sock_set_mark);
949 
950 /*
951  *	This is meant for all protocols to use and covers goings on
952  *	at the socket level. Everything here is generic.
953  */
954 
955 int sock_setsockopt(struct socket *sock, int level, int optname,
956 		    sockptr_t optval, unsigned int optlen)
957 {
958 	struct so_timestamping timestamping;
959 	struct sock_txtime sk_txtime;
960 	struct sock *sk = sock->sk;
961 	int val;
962 	int valbool;
963 	struct linger ling;
964 	int ret = 0;
965 
966 	/*
967 	 *	Options without arguments
968 	 */
969 
970 	if (optname == SO_BINDTODEVICE)
971 		return sock_setbindtodevice(sk, optval, optlen);
972 
973 	if (optlen < sizeof(int))
974 		return -EINVAL;
975 
976 	if (copy_from_sockptr(&val, optval, sizeof(val)))
977 		return -EFAULT;
978 
979 	valbool = val ? 1 : 0;
980 
981 	lock_sock(sk);
982 
983 	switch (optname) {
984 	case SO_DEBUG:
985 		if (val && !capable(CAP_NET_ADMIN))
986 			ret = -EACCES;
987 		else
988 			sock_valbool_flag(sk, SOCK_DBG, valbool);
989 		break;
990 	case SO_REUSEADDR:
991 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
992 		break;
993 	case SO_REUSEPORT:
994 		sk->sk_reuseport = valbool;
995 		break;
996 	case SO_TYPE:
997 	case SO_PROTOCOL:
998 	case SO_DOMAIN:
999 	case SO_ERROR:
1000 		ret = -ENOPROTOOPT;
1001 		break;
1002 	case SO_DONTROUTE:
1003 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1004 		sk_dst_reset(sk);
1005 		break;
1006 	case SO_BROADCAST:
1007 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1008 		break;
1009 	case SO_SNDBUF:
1010 		/* Don't error on this; BSD doesn't, and if you think
1011 		 * about it this is right. Otherwise apps have to
1012 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1013 		 * are treated in BSD as hints.
1014 		 */
1015 		val = min_t(u32, val, sysctl_wmem_max);
1016 set_sndbuf:
1017 		/* Ensure val * 2 fits into an int, to prevent max_t()
1018 		 * from treating it as a negative value.
1019 		 */
1020 		val = min_t(int, val, INT_MAX / 2);
1021 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1022 		WRITE_ONCE(sk->sk_sndbuf,
1023 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1024 		/* Wake up sending tasks if we upped the value. */
1025 		sk->sk_write_space(sk);
1026 		break;
1027 
1028 	case SO_SNDBUFFORCE:
1029 		if (!capable(CAP_NET_ADMIN)) {
1030 			ret = -EPERM;
1031 			break;
1032 		}
1033 
1034 		/* No negative values (to prevent underflow, as val will be
1035 		 * multiplied by 2).
1036 		 */
1037 		if (val < 0)
1038 			val = 0;
1039 		goto set_sndbuf;
1040 
1041 	case SO_RCVBUF:
1042 		/* Don't error on this; BSD doesn't, and if you think
1043 		 * about it this is right. Otherwise apps have to
1044 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1045 		 * are treated in BSD as hints.
1046 		 */
1047 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1048 		break;
1049 
1050 	case SO_RCVBUFFORCE:
1051 		if (!capable(CAP_NET_ADMIN)) {
1052 			ret = -EPERM;
1053 			break;
1054 		}
1055 
1056 		/* No negative values (to prevent underflow, as val will be
1057 		 * multiplied by 2).
1058 		 */
1059 		__sock_set_rcvbuf(sk, max(val, 0));
1060 		break;
1061 
1062 	case SO_KEEPALIVE:
1063 		if (sk->sk_prot->keepalive)
1064 			sk->sk_prot->keepalive(sk, valbool);
1065 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1066 		break;
1067 
1068 	case SO_OOBINLINE:
1069 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1070 		break;
1071 
1072 	case SO_NO_CHECK:
1073 		sk->sk_no_check_tx = valbool;
1074 		break;
1075 
1076 	case SO_PRIORITY:
1077 		if ((val >= 0 && val <= 6) ||
1078 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1079 			sk->sk_priority = val;
1080 		else
1081 			ret = -EPERM;
1082 		break;
1083 
1084 	case SO_LINGER:
1085 		if (optlen < sizeof(ling)) {
1086 			ret = -EINVAL;	/* 1003.1g */
1087 			break;
1088 		}
1089 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1090 			ret = -EFAULT;
1091 			break;
1092 		}
1093 		if (!ling.l_onoff)
1094 			sock_reset_flag(sk, SOCK_LINGER);
1095 		else {
1096 #if (BITS_PER_LONG == 32)
1097 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1098 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1099 			else
1100 #endif
1101 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1102 			sock_set_flag(sk, SOCK_LINGER);
1103 		}
1104 		break;
1105 
1106 	case SO_BSDCOMPAT:
1107 		break;
1108 
1109 	case SO_PASSCRED:
1110 		if (valbool)
1111 			set_bit(SOCK_PASSCRED, &sock->flags);
1112 		else
1113 			clear_bit(SOCK_PASSCRED, &sock->flags);
1114 		break;
1115 
1116 	case SO_TIMESTAMP_OLD:
1117 	case SO_TIMESTAMP_NEW:
1118 	case SO_TIMESTAMPNS_OLD:
1119 	case SO_TIMESTAMPNS_NEW:
1120 		sock_set_timestamp(sk, optname, valbool);
1121 		break;
1122 
1123 	case SO_TIMESTAMPING_NEW:
1124 	case SO_TIMESTAMPING_OLD:
1125 		if (optlen == sizeof(timestamping)) {
1126 			if (copy_from_sockptr(&timestamping, optval,
1127 					      sizeof(timestamping))) {
1128 				ret = -EFAULT;
1129 				break;
1130 			}
1131 		} else {
1132 			memset(&timestamping, 0, sizeof(timestamping));
1133 			timestamping.flags = val;
1134 		}
1135 		ret = sock_set_timestamping(sk, optname, timestamping);
1136 		break;
1137 
1138 	case SO_RCVLOWAT:
1139 		if (val < 0)
1140 			val = INT_MAX;
1141 		if (sock->ops->set_rcvlowat)
1142 			ret = sock->ops->set_rcvlowat(sk, val);
1143 		else
1144 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1145 		break;
1146 
1147 	case SO_RCVTIMEO_OLD:
1148 	case SO_RCVTIMEO_NEW:
1149 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1150 				       optlen, optname == SO_RCVTIMEO_OLD);
1151 		break;
1152 
1153 	case SO_SNDTIMEO_OLD:
1154 	case SO_SNDTIMEO_NEW:
1155 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1156 				       optlen, optname == SO_SNDTIMEO_OLD);
1157 		break;
1158 
1159 	case SO_ATTACH_FILTER: {
1160 		struct sock_fprog fprog;
1161 
1162 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1163 		if (!ret)
1164 			ret = sk_attach_filter(&fprog, sk);
1165 		break;
1166 	}
1167 	case SO_ATTACH_BPF:
1168 		ret = -EINVAL;
1169 		if (optlen == sizeof(u32)) {
1170 			u32 ufd;
1171 
1172 			ret = -EFAULT;
1173 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1174 				break;
1175 
1176 			ret = sk_attach_bpf(ufd, sk);
1177 		}
1178 		break;
1179 
1180 	case SO_ATTACH_REUSEPORT_CBPF: {
1181 		struct sock_fprog fprog;
1182 
1183 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1184 		if (!ret)
1185 			ret = sk_reuseport_attach_filter(&fprog, sk);
1186 		break;
1187 	}
1188 	case SO_ATTACH_REUSEPORT_EBPF:
1189 		ret = -EINVAL;
1190 		if (optlen == sizeof(u32)) {
1191 			u32 ufd;
1192 
1193 			ret = -EFAULT;
1194 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1195 				break;
1196 
1197 			ret = sk_reuseport_attach_bpf(ufd, sk);
1198 		}
1199 		break;
1200 
1201 	case SO_DETACH_REUSEPORT_BPF:
1202 		ret = reuseport_detach_prog(sk);
1203 		break;
1204 
1205 	case SO_DETACH_FILTER:
1206 		ret = sk_detach_filter(sk);
1207 		break;
1208 
1209 	case SO_LOCK_FILTER:
1210 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1211 			ret = -EPERM;
1212 		else
1213 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1214 		break;
1215 
1216 	case SO_PASSSEC:
1217 		if (valbool)
1218 			set_bit(SOCK_PASSSEC, &sock->flags);
1219 		else
1220 			clear_bit(SOCK_PASSSEC, &sock->flags);
1221 		break;
1222 	case SO_MARK:
1223 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1224 			ret = -EPERM;
1225 			break;
1226 		}
1227 
1228 		__sock_set_mark(sk, val);
1229 		break;
1230 
1231 	case SO_RXQ_OVFL:
1232 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1233 		break;
1234 
1235 	case SO_WIFI_STATUS:
1236 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1237 		break;
1238 
1239 	case SO_PEEK_OFF:
1240 		if (sock->ops->set_peek_off)
1241 			ret = sock->ops->set_peek_off(sk, val);
1242 		else
1243 			ret = -EOPNOTSUPP;
1244 		break;
1245 
1246 	case SO_NOFCS:
1247 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1248 		break;
1249 
1250 	case SO_SELECT_ERR_QUEUE:
1251 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1252 		break;
1253 
1254 #ifdef CONFIG_NET_RX_BUSY_POLL
1255 	case SO_BUSY_POLL:
1256 		/* allow unprivileged users to decrease the value */
1257 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1258 			ret = -EPERM;
1259 		else {
1260 			if (val < 0)
1261 				ret = -EINVAL;
1262 			else
1263 				WRITE_ONCE(sk->sk_ll_usec, val);
1264 		}
1265 		break;
1266 	case SO_PREFER_BUSY_POLL:
1267 		if (valbool && !capable(CAP_NET_ADMIN))
1268 			ret = -EPERM;
1269 		else
1270 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1271 		break;
1272 	case SO_BUSY_POLL_BUDGET:
1273 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1274 			ret = -EPERM;
1275 		} else {
1276 			if (val < 0 || val > U16_MAX)
1277 				ret = -EINVAL;
1278 			else
1279 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1280 		}
1281 		break;
1282 #endif
1283 
1284 	case SO_MAX_PACING_RATE:
1285 		{
1286 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1287 
1288 		if (sizeof(ulval) != sizeof(val) &&
1289 		    optlen >= sizeof(ulval) &&
1290 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1291 			ret = -EFAULT;
1292 			break;
1293 		}
1294 		if (ulval != ~0UL)
1295 			cmpxchg(&sk->sk_pacing_status,
1296 				SK_PACING_NONE,
1297 				SK_PACING_NEEDED);
1298 		sk->sk_max_pacing_rate = ulval;
1299 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1300 		break;
1301 		}
1302 	case SO_INCOMING_CPU:
1303 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1304 		break;
1305 
1306 	case SO_CNX_ADVICE:
1307 		if (val == 1)
1308 			dst_negative_advice(sk);
1309 		break;
1310 
1311 	case SO_ZEROCOPY:
1312 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1313 			if (!((sk->sk_type == SOCK_STREAM &&
1314 			       sk->sk_protocol == IPPROTO_TCP) ||
1315 			      (sk->sk_type == SOCK_DGRAM &&
1316 			       sk->sk_protocol == IPPROTO_UDP)))
1317 				ret = -ENOTSUPP;
1318 		} else if (sk->sk_family != PF_RDS) {
1319 			ret = -ENOTSUPP;
1320 		}
1321 		if (!ret) {
1322 			if (val < 0 || val > 1)
1323 				ret = -EINVAL;
1324 			else
1325 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1326 		}
1327 		break;
1328 
1329 	case SO_TXTIME:
1330 		if (optlen != sizeof(struct sock_txtime)) {
1331 			ret = -EINVAL;
1332 			break;
1333 		} else if (copy_from_sockptr(&sk_txtime, optval,
1334 			   sizeof(struct sock_txtime))) {
1335 			ret = -EFAULT;
1336 			break;
1337 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1338 			ret = -EINVAL;
1339 			break;
1340 		}
1341 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1342 		 * scheduler has enough safeguards.
1343 		 */
1344 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1345 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1346 			ret = -EPERM;
1347 			break;
1348 		}
1349 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1350 		sk->sk_clockid = sk_txtime.clockid;
1351 		sk->sk_txtime_deadline_mode =
1352 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1353 		sk->sk_txtime_report_errors =
1354 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1355 		break;
1356 
1357 	case SO_BINDTOIFINDEX:
1358 		ret = sock_bindtoindex_locked(sk, val);
1359 		break;
1360 
1361 	case SO_BUF_LOCK:
1362 		if (val & ~SOCK_BUF_LOCK_MASK) {
1363 			ret = -EINVAL;
1364 			break;
1365 		}
1366 		sk->sk_userlocks = val | (sk->sk_userlocks &
1367 					  ~SOCK_BUF_LOCK_MASK);
1368 		break;
1369 
1370 	default:
1371 		ret = -ENOPROTOOPT;
1372 		break;
1373 	}
1374 	release_sock(sk);
1375 	return ret;
1376 }
1377 EXPORT_SYMBOL(sock_setsockopt);
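
/*
 * Userspace sketch (illustrative only, assuming the request stays below
 * sysctl_wmem_max): the SO_SNDBUF path above doubles the requested value to
 * leave room for struct sk_buff overhead, and that doubled value (131072
 * here) is what getsockopt() hands back afterwards:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
 */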
1378 
1379 static const struct cred *sk_get_peer_cred(struct sock *sk)
1380 {
1381 	const struct cred *cred;
1382 
1383 	spin_lock(&sk->sk_peer_lock);
1384 	cred = get_cred(sk->sk_peer_cred);
1385 	spin_unlock(&sk->sk_peer_lock);
1386 
1387 	return cred;
1388 }
1389 
1390 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1391 			  struct ucred *ucred)
1392 {
1393 	ucred->pid = pid_vnr(pid);
1394 	ucred->uid = ucred->gid = -1;
1395 	if (cred) {
1396 		struct user_namespace *current_ns = current_user_ns();
1397 
1398 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1399 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1400 	}
1401 }
1402 
1403 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1404 {
1405 	struct user_namespace *user_ns = current_user_ns();
1406 	int i;
1407 
1408 	for (i = 0; i < src->ngroups; i++)
1409 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1410 			return -EFAULT;
1411 
1412 	return 0;
1413 }
1414 
1415 int sock_getsockopt(struct socket *sock, int level, int optname,
1416 		    char __user *optval, int __user *optlen)
1417 {
1418 	struct sock *sk = sock->sk;
1419 
1420 	union {
1421 		int val;
1422 		u64 val64;
1423 		unsigned long ulval;
1424 		struct linger ling;
1425 		struct old_timeval32 tm32;
1426 		struct __kernel_old_timeval tm;
1427 		struct  __kernel_sock_timeval stm;
1428 		struct sock_txtime txtime;
1429 		struct so_timestamping timestamping;
1430 	} v;
1431 
1432 	int lv = sizeof(int);
1433 	int len;
1434 
1435 	if (get_user(len, optlen))
1436 		return -EFAULT;
1437 	if (len < 0)
1438 		return -EINVAL;
1439 
1440 	memset(&v, 0, sizeof(v));
1441 
1442 	switch (optname) {
1443 	case SO_DEBUG:
1444 		v.val = sock_flag(sk, SOCK_DBG);
1445 		break;
1446 
1447 	case SO_DONTROUTE:
1448 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1449 		break;
1450 
1451 	case SO_BROADCAST:
1452 		v.val = sock_flag(sk, SOCK_BROADCAST);
1453 		break;
1454 
1455 	case SO_SNDBUF:
1456 		v.val = sk->sk_sndbuf;
1457 		break;
1458 
1459 	case SO_RCVBUF:
1460 		v.val = sk->sk_rcvbuf;
1461 		break;
1462 
1463 	case SO_REUSEADDR:
1464 		v.val = sk->sk_reuse;
1465 		break;
1466 
1467 	case SO_REUSEPORT:
1468 		v.val = sk->sk_reuseport;
1469 		break;
1470 
1471 	case SO_KEEPALIVE:
1472 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1473 		break;
1474 
1475 	case SO_TYPE:
1476 		v.val = sk->sk_type;
1477 		break;
1478 
1479 	case SO_PROTOCOL:
1480 		v.val = sk->sk_protocol;
1481 		break;
1482 
1483 	case SO_DOMAIN:
1484 		v.val = sk->sk_family;
1485 		break;
1486 
1487 	case SO_ERROR:
1488 		v.val = -sock_error(sk);
1489 		if (v.val == 0)
1490 			v.val = xchg(&sk->sk_err_soft, 0);
1491 		break;
1492 
1493 	case SO_OOBINLINE:
1494 		v.val = sock_flag(sk, SOCK_URGINLINE);
1495 		break;
1496 
1497 	case SO_NO_CHECK:
1498 		v.val = sk->sk_no_check_tx;
1499 		break;
1500 
1501 	case SO_PRIORITY:
1502 		v.val = sk->sk_priority;
1503 		break;
1504 
1505 	case SO_LINGER:
1506 		lv		= sizeof(v.ling);
1507 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1508 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1509 		break;
1510 
1511 	case SO_BSDCOMPAT:
1512 		break;
1513 
1514 	case SO_TIMESTAMP_OLD:
1515 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1516 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1517 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1518 		break;
1519 
1520 	case SO_TIMESTAMPNS_OLD:
1521 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1522 		break;
1523 
1524 	case SO_TIMESTAMP_NEW:
1525 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1526 		break;
1527 
1528 	case SO_TIMESTAMPNS_NEW:
1529 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1530 		break;
1531 
1532 	case SO_TIMESTAMPING_OLD:
1533 		lv = sizeof(v.timestamping);
1534 		v.timestamping.flags = sk->sk_tsflags;
1535 		v.timestamping.bind_phc = sk->sk_bind_phc;
1536 		break;
1537 
1538 	case SO_RCVTIMEO_OLD:
1539 	case SO_RCVTIMEO_NEW:
1540 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1541 		break;
1542 
1543 	case SO_SNDTIMEO_OLD:
1544 	case SO_SNDTIMEO_NEW:
1545 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1546 		break;
1547 
1548 	case SO_RCVLOWAT:
1549 		v.val = sk->sk_rcvlowat;
1550 		break;
1551 
1552 	case SO_SNDLOWAT:
1553 		v.val = 1;
1554 		break;
1555 
1556 	case SO_PASSCRED:
1557 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1558 		break;
1559 
1560 	case SO_PEERCRED:
1561 	{
1562 		struct ucred peercred;
1563 		if (len > sizeof(peercred))
1564 			len = sizeof(peercred);
1565 
1566 		spin_lock(&sk->sk_peer_lock);
1567 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1568 		spin_unlock(&sk->sk_peer_lock);
1569 
1570 		if (copy_to_user(optval, &peercred, len))
1571 			return -EFAULT;
1572 		goto lenout;
1573 	}
1574 
1575 	case SO_PEERGROUPS:
1576 	{
1577 		const struct cred *cred;
1578 		int ret, n;
1579 
1580 		cred = sk_get_peer_cred(sk);
1581 		if (!cred)
1582 			return -ENODATA;
1583 
1584 		n = cred->group_info->ngroups;
1585 		if (len < n * sizeof(gid_t)) {
1586 			len = n * sizeof(gid_t);
1587 			put_cred(cred);
1588 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1589 		}
1590 		len = n * sizeof(gid_t);
1591 
1592 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1593 		put_cred(cred);
1594 		if (ret)
1595 			return ret;
1596 		goto lenout;
1597 	}
1598 
1599 	case SO_PEERNAME:
1600 	{
1601 		char address[128];
1602 
1603 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1604 		if (lv < 0)
1605 			return -ENOTCONN;
1606 		if (lv < len)
1607 			return -EINVAL;
1608 		if (copy_to_user(optval, address, len))
1609 			return -EFAULT;
1610 		goto lenout;
1611 	}
1612 
1613 	/* Dubious BSD thing... Probably nobody even uses it, but
1614 	 * the UNIX standard wants it for whatever reason... -DaveM
1615 	 */
1616 	case SO_ACCEPTCONN:
1617 		v.val = sk->sk_state == TCP_LISTEN;
1618 		break;
1619 
1620 	case SO_PASSSEC:
1621 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1622 		break;
1623 
1624 	case SO_PEERSEC:
1625 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1626 
1627 	case SO_MARK:
1628 		v.val = sk->sk_mark;
1629 		break;
1630 
1631 	case SO_RXQ_OVFL:
1632 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1633 		break;
1634 
1635 	case SO_WIFI_STATUS:
1636 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1637 		break;
1638 
1639 	case SO_PEEK_OFF:
1640 		if (!sock->ops->set_peek_off)
1641 			return -EOPNOTSUPP;
1642 
1643 		v.val = sk->sk_peek_off;
1644 		break;
1645 	case SO_NOFCS:
1646 		v.val = sock_flag(sk, SOCK_NOFCS);
1647 		break;
1648 
1649 	case SO_BINDTODEVICE:
1650 		return sock_getbindtodevice(sk, optval, optlen, len);
1651 
1652 	case SO_GET_FILTER:
1653 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1654 		if (len < 0)
1655 			return len;
1656 
1657 		goto lenout;
1658 
1659 	case SO_LOCK_FILTER:
1660 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1661 		break;
1662 
1663 	case SO_BPF_EXTENSIONS:
1664 		v.val = bpf_tell_extensions();
1665 		break;
1666 
1667 	case SO_SELECT_ERR_QUEUE:
1668 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1669 		break;
1670 
1671 #ifdef CONFIG_NET_RX_BUSY_POLL
1672 	case SO_BUSY_POLL:
1673 		v.val = sk->sk_ll_usec;
1674 		break;
1675 	case SO_PREFER_BUSY_POLL:
1676 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1677 		break;
1678 #endif
1679 
1680 	case SO_MAX_PACING_RATE:
1681 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1682 			lv = sizeof(v.ulval);
1683 			v.ulval = sk->sk_max_pacing_rate;
1684 		} else {
1685 			/* 32bit version */
1686 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1687 		}
1688 		break;
1689 
1690 	case SO_INCOMING_CPU:
1691 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1692 		break;
1693 
1694 	case SO_MEMINFO:
1695 	{
1696 		u32 meminfo[SK_MEMINFO_VARS];
1697 
1698 		sk_get_meminfo(sk, meminfo);
1699 
1700 		len = min_t(unsigned int, len, sizeof(meminfo));
1701 		if (copy_to_user(optval, &meminfo, len))
1702 			return -EFAULT;
1703 
1704 		goto lenout;
1705 	}
1706 
1707 #ifdef CONFIG_NET_RX_BUSY_POLL
1708 	case SO_INCOMING_NAPI_ID:
1709 		v.val = READ_ONCE(sk->sk_napi_id);
1710 
1711 		/* aggregate non-NAPI IDs down to 0 */
1712 		if (v.val < MIN_NAPI_ID)
1713 			v.val = 0;
1714 
1715 		break;
1716 #endif
1717 
1718 	case SO_COOKIE:
1719 		lv = sizeof(u64);
1720 		if (len < lv)
1721 			return -EINVAL;
1722 		v.val64 = sock_gen_cookie(sk);
1723 		break;
1724 
1725 	case SO_ZEROCOPY:
1726 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1727 		break;
1728 
1729 	case SO_TXTIME:
1730 		lv = sizeof(v.txtime);
1731 		v.txtime.clockid = sk->sk_clockid;
1732 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1733 				  SOF_TXTIME_DEADLINE_MODE : 0;
1734 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1735 				  SOF_TXTIME_REPORT_ERRORS : 0;
1736 		break;
1737 
1738 	case SO_BINDTOIFINDEX:
1739 		v.val = sk->sk_bound_dev_if;
1740 		break;
1741 
1742 	case SO_NETNS_COOKIE:
1743 		lv = sizeof(u64);
1744 		if (len != lv)
1745 			return -EINVAL;
1746 		v.val64 = sock_net(sk)->net_cookie;
1747 		break;
1748 
1749 	case SO_BUF_LOCK:
1750 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1751 		break;
1752 
1753 	default:
1754 		/* We implement the SO_SNDLOWAT etc to not be settable
1755 		 * (1003.1g 7).
1756 		 */
1757 		return -ENOPROTOOPT;
1758 	}
1759 
1760 	if (len > lv)
1761 		len = lv;
1762 	if (copy_to_user(optval, &v, len))
1763 		return -EFAULT;
1764 lenout:
1765 	if (put_user(len, optlen))
1766 		return -EFAULT;
1767 	return 0;
1768 }
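
/*
 * Userspace sketch (illustrative only; fd is assumed to be a connected
 * AF_UNIX socket): SO_PEERCRED above copies out the peer's pid/uid/gid,
 * translated into the caller's namespaces by cred_to_ucred():
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len) == 0)
 *		printf("peer pid %d uid %d gid %d\n", cr.pid, cr.uid, cr.gid);
 */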
1769 
1770 /*
1771  * Initialize an sk_lock.
1772  *
1773  * (We also register the sk_lock with the lock validator.)
1774  */
1775 static inline void sock_lock_init(struct sock *sk)
1776 {
1777 	if (sk->sk_kern_sock)
1778 		sock_lock_init_class_and_name(
1779 			sk,
1780 			af_family_kern_slock_key_strings[sk->sk_family],
1781 			af_family_kern_slock_keys + sk->sk_family,
1782 			af_family_kern_key_strings[sk->sk_family],
1783 			af_family_kern_keys + sk->sk_family);
1784 	else
1785 		sock_lock_init_class_and_name(
1786 			sk,
1787 			af_family_slock_key_strings[sk->sk_family],
1788 			af_family_slock_keys + sk->sk_family,
1789 			af_family_key_strings[sk->sk_family],
1790 			af_family_keys + sk->sk_family);
1791 }
1792 
1793 /*
1794  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1795  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1796  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1797  */
1798 static void sock_copy(struct sock *nsk, const struct sock *osk)
1799 {
1800 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1801 #ifdef CONFIG_SECURITY_NETWORK
1802 	void *sptr = nsk->sk_security;
1803 #endif
1804 
1805 	/* If we move sk_tx_queue_mapping out of the private section,
1806 	 * we must check if sk_tx_queue_clear() is called after
1807 	 * sock_copy() in sk_clone_lock().
1808 	 */
1809 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1810 		     offsetof(struct sock, sk_dontcopy_begin) ||
1811 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1812 		     offsetof(struct sock, sk_dontcopy_end));
1813 
1814 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1815 
1816 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1817 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1818 
1819 #ifdef CONFIG_SECURITY_NETWORK
1820 	nsk->sk_security = sptr;
1821 	security_sk_clone(osk, nsk);
1822 #endif
1823 }
1824 
1825 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1826 		int family)
1827 {
1828 	struct sock *sk;
1829 	struct kmem_cache *slab;
1830 
1831 	slab = prot->slab;
1832 	if (slab != NULL) {
1833 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1834 		if (!sk)
1835 			return sk;
1836 		if (want_init_on_alloc(priority))
1837 			sk_prot_clear_nulls(sk, prot->obj_size);
1838 	} else
1839 		sk = kmalloc(prot->obj_size, priority);
1840 
1841 	if (sk != NULL) {
1842 		if (security_sk_alloc(sk, family, priority))
1843 			goto out_free;
1844 
1845 		if (!try_module_get(prot->owner))
1846 			goto out_free_sec;
1847 	}
1848 
1849 	return sk;
1850 
1851 out_free_sec:
1852 	security_sk_free(sk);
1853 out_free:
1854 	if (slab != NULL)
1855 		kmem_cache_free(slab, sk);
1856 	else
1857 		kfree(sk);
1858 	return NULL;
1859 }
1860 
1861 static void sk_prot_free(struct proto *prot, struct sock *sk)
1862 {
1863 	struct kmem_cache *slab;
1864 	struct module *owner;
1865 
1866 	owner = prot->owner;
1867 	slab = prot->slab;
1868 
1869 	cgroup_sk_free(&sk->sk_cgrp_data);
1870 	mem_cgroup_sk_free(sk);
1871 	security_sk_free(sk);
1872 	if (slab != NULL)
1873 		kmem_cache_free(slab, sk);
1874 	else
1875 		kfree(sk);
1876 	module_put(owner);
1877 }
1878 
1879 /**
1880  *	sk_alloc - All socket objects are allocated here
1881  *	@net: the applicable net namespace
1882  *	@family: protocol family
1883  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1884  *	@prot: struct proto associated with this new sock instance
1885  *	@kern: is this to be a kernel socket?
1886  */
1887 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1888 		      struct proto *prot, int kern)
1889 {
1890 	struct sock *sk;
1891 
1892 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1893 	if (sk) {
1894 		sk->sk_family = family;
1895 		/*
1896 		 * See comment in struct sock definition to understand
1897 		 * why we need sk_prot_creator -acme
1898 		 */
1899 		sk->sk_prot = sk->sk_prot_creator = prot;
1900 		sk->sk_kern_sock = kern;
1901 		sock_lock_init(sk);
1902 		sk->sk_net_refcnt = kern ? 0 : 1;
1903 		if (likely(sk->sk_net_refcnt)) {
1904 			get_net(net);
1905 			sock_inuse_add(net, 1);
1906 		}
1907 
1908 		sock_net_set(sk, net);
1909 		refcount_set(&sk->sk_wmem_alloc, 1);
1910 
1911 		mem_cgroup_sk_alloc(sk);
1912 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1913 		sock_update_classid(&sk->sk_cgrp_data);
1914 		sock_update_netprioidx(&sk->sk_cgrp_data);
1915 		sk_tx_queue_clear(sk);
1916 	}
1917 
1918 	return sk;
1919 }
1920 EXPORT_SYMBOL(sk_alloc);
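
/* Illustrative sketch (not part of the original file): a protocol family's
 * ->create() handler typically allocates its sock with sk_alloc() and then
 * finishes generic setup with sock_init_data(). "my_prot" and "my_sock_init"
 * are hypothetical names used only for this example.
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_prot, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	my_sock_init(sk);
 */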
1921 
1922 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1923  * grace period. This is the case for UDP sockets and TCP listeners.
1924  */
1925 static void __sk_destruct(struct rcu_head *head)
1926 {
1927 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1928 	struct sk_filter *filter;
1929 
1930 	if (sk->sk_destruct)
1931 		sk->sk_destruct(sk);
1932 
1933 	filter = rcu_dereference_check(sk->sk_filter,
1934 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1935 	if (filter) {
1936 		sk_filter_uncharge(sk, filter);
1937 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1938 	}
1939 
1940 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1941 
1942 #ifdef CONFIG_BPF_SYSCALL
1943 	bpf_sk_storage_free(sk);
1944 #endif
1945 
1946 	if (atomic_read(&sk->sk_omem_alloc))
1947 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1948 			 __func__, atomic_read(&sk->sk_omem_alloc));
1949 
1950 	if (sk->sk_frag.page) {
1951 		put_page(sk->sk_frag.page);
1952 		sk->sk_frag.page = NULL;
1953 	}
1954 
1955 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1956 	put_cred(sk->sk_peer_cred);
1957 	put_pid(sk->sk_peer_pid);
1958 
1959 	if (likely(sk->sk_net_refcnt))
1960 		put_net(sock_net(sk));
1961 	sk_prot_free(sk->sk_prot_creator, sk);
1962 }
1963 
1964 void sk_destruct(struct sock *sk)
1965 {
1966 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1967 
1968 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1969 		reuseport_detach_sock(sk);
1970 		use_call_rcu = true;
1971 	}
1972 
1973 	if (use_call_rcu)
1974 		call_rcu(&sk->sk_rcu, __sk_destruct);
1975 	else
1976 		__sk_destruct(&sk->sk_rcu);
1977 }
1978 
1979 static void __sk_free(struct sock *sk)
1980 {
1981 	if (likely(sk->sk_net_refcnt))
1982 		sock_inuse_add(sock_net(sk), -1);
1983 
1984 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1985 		sock_diag_broadcast_destroy(sk);
1986 	else
1987 		sk_destruct(sk);
1988 }
1989 
1990 void sk_free(struct sock *sk)
1991 {
1992 	/*
1993 	 * We subtract one from sk_wmem_alloc so we can tell whether
1994 	 * some packets are still in a tx queue.
1995 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later
1996 	 */
1997 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1998 		__sk_free(sk);
1999 }
2000 EXPORT_SYMBOL(sk_free);
2001 
2002 static void sk_init_common(struct sock *sk)
2003 {
2004 	skb_queue_head_init(&sk->sk_receive_queue);
2005 	skb_queue_head_init(&sk->sk_write_queue);
2006 	skb_queue_head_init(&sk->sk_error_queue);
2007 
2008 	rwlock_init(&sk->sk_callback_lock);
2009 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2010 			af_rlock_keys + sk->sk_family,
2011 			af_family_rlock_key_strings[sk->sk_family]);
2012 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2013 			af_wlock_keys + sk->sk_family,
2014 			af_family_wlock_key_strings[sk->sk_family]);
2015 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2016 			af_elock_keys + sk->sk_family,
2017 			af_family_elock_key_strings[sk->sk_family]);
2018 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2019 			af_callback_keys + sk->sk_family,
2020 			af_family_clock_key_strings[sk->sk_family]);
2021 }
2022 
2023 /**
2024  *	sk_clone_lock - clone a socket, and lock its clone
2025  *	@sk: the socket to clone
2026  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2027  *
2028  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2029  */
2030 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2031 {
2032 	struct proto *prot = READ_ONCE(sk->sk_prot);
2033 	struct sk_filter *filter;
2034 	bool is_charged = true;
2035 	struct sock *newsk;
2036 
2037 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2038 	if (!newsk)
2039 		goto out;
2040 
2041 	sock_copy(newsk, sk);
2042 
2043 	newsk->sk_prot_creator = prot;
2044 
2045 	/* SANITY */
2046 	if (likely(newsk->sk_net_refcnt))
2047 		get_net(sock_net(newsk));
2048 	sk_node_init(&newsk->sk_node);
2049 	sock_lock_init(newsk);
2050 	bh_lock_sock(newsk);
2051 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2052 	newsk->sk_backlog.len = 0;
2053 
2054 	atomic_set(&newsk->sk_rmem_alloc, 0);
2055 
2056 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2057 	refcount_set(&newsk->sk_wmem_alloc, 1);
2058 
2059 	atomic_set(&newsk->sk_omem_alloc, 0);
2060 	sk_init_common(newsk);
2061 
2062 	newsk->sk_dst_cache	= NULL;
2063 	newsk->sk_dst_pending_confirm = 0;
2064 	newsk->sk_wmem_queued	= 0;
2065 	newsk->sk_forward_alloc = 0;
2066 	atomic_set(&newsk->sk_drops, 0);
2067 	newsk->sk_send_head	= NULL;
2068 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2069 	atomic_set(&newsk->sk_zckey, 0);
2070 
2071 	sock_reset_flag(newsk, SOCK_DONE);
2072 
2073 	/* sk->sk_memcg will be populated at accept() time */
2074 	newsk->sk_memcg = NULL;
2075 
2076 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2077 
2078 	rcu_read_lock();
2079 	filter = rcu_dereference(sk->sk_filter);
2080 	if (filter != NULL)
2081 		/* though it's an empty new sock, the charging may fail
2082 		 * if sysctl_optmem_max was changed between creation of
2083 		 * original socket and cloning
2084 		 */
2085 		is_charged = sk_filter_charge(newsk, filter);
2086 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2087 	rcu_read_unlock();
2088 
2089 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2090 		/* We need to make sure that we don't uncharge the new
2091 		 * socket if we couldn't charge it in the first place
2092 		 * as otherwise we uncharge the parent's filter.
2093 		 */
2094 		if (!is_charged)
2095 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2096 		sk_free_unlock_clone(newsk);
2097 		newsk = NULL;
2098 		goto out;
2099 	}
2100 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2101 
2102 	if (bpf_sk_storage_clone(sk, newsk)) {
2103 		sk_free_unlock_clone(newsk);
2104 		newsk = NULL;
2105 		goto out;
2106 	}
2107 
2108 	/* Clear sk_user_data if parent had the pointer tagged
2109 	 * as not suitable for copying when cloning.
2110 	 */
2111 	if (sk_user_data_is_nocopy(newsk))
2112 		newsk->sk_user_data = NULL;
2113 
2114 	newsk->sk_err	   = 0;
2115 	newsk->sk_err_soft = 0;
2116 	newsk->sk_priority = 0;
2117 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2118 	if (likely(newsk->sk_net_refcnt))
2119 		sock_inuse_add(sock_net(newsk), 1);
2120 
2121 	/* Before updating sk_refcnt, we must commit prior changes to memory
2122 	 * (Documentation/RCU/rculist_nulls.rst for details)
2123 	 */
2124 	smp_wmb();
2125 	refcount_set(&newsk->sk_refcnt, 2);
2126 
2127 	/* Increment the counter in the same struct proto as the master
2128 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2129 	 * is the same as sk->sk_prot->socks, as this field was copied
2130 	 * with memcpy).
2131 	 *
2132 	 * This _changes_ the previous behaviour, where
2133 	 * tcp_create_openreq_child always incremented the
2134 	 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2135 	 * to be taken into account in all callers. -acme
2136 	 */
2137 	sk_refcnt_debug_inc(newsk);
2138 	sk_set_socket(newsk, NULL);
2139 	sk_tx_queue_clear(newsk);
2140 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2141 
2142 	if (newsk->sk_prot->sockets_allocated)
2143 		sk_sockets_allocated_inc(newsk);
2144 
2145 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2146 		net_enable_timestamp();
2147 out:
2148 	return newsk;
2149 }
2150 EXPORT_SYMBOL_GPL(sk_clone_lock);
2151 
2152 void sk_free_unlock_clone(struct sock *sk)
2153 {
2154 	/* It is still a raw copy of the parent, so invalidate
2155 	 * the destructor and do a plain sk_free() */
2156 	sk->sk_destruct = NULL;
2157 	bh_unlock_sock(sk);
2158 	sk_free(sk);
2159 }
2160 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2161 
2162 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2163 {
2164 	u32 max_segs = 1;
2165 
2166 	sk_dst_set(sk, dst);
2167 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2168 	if (sk->sk_route_caps & NETIF_F_GSO)
2169 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2170 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2171 	if (sk_can_gso(sk)) {
2172 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2173 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2174 		} else {
2175 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2176 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2177 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2178 		}
2179 	}
2180 	sk->sk_gso_max_segs = max_segs;
2181 }
2182 EXPORT_SYMBOL_GPL(sk_setup_caps);
2183 
2184 /*
2185  *	Simple resource managers for sockets.
2186  */
2187 
2188 
2189 /*
2190  * Write buffer destructor automatically called from kfree_skb.
2191  */
2192 void sock_wfree(struct sk_buff *skb)
2193 {
2194 	struct sock *sk = skb->sk;
2195 	unsigned int len = skb->truesize;
2196 
2197 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2198 		/*
2199 		 * Keep a reference on sk_wmem_alloc, this will be released
2200 		 * after sk_write_space() call
2201 		 */
2202 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2203 		sk->sk_write_space(sk);
2204 		len = 1;
2205 	}
2206 	/*
2207 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2208 	 * could not do because of in-flight packets
2209 	 */
2210 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2211 		__sk_free(sk);
2212 }
2213 EXPORT_SYMBOL(sock_wfree);
2214 
2215 /* This variant of sock_wfree() is used by TCP,
2216  * since it sets SOCK_USE_WRITE_QUEUE.
2217  */
2218 void __sock_wfree(struct sk_buff *skb)
2219 {
2220 	struct sock *sk = skb->sk;
2221 
2222 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2223 		__sk_free(sk);
2224 }
2225 
2226 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2227 {
2228 	skb_orphan(skb);
2229 	skb->sk = sk;
2230 #ifdef CONFIG_INET
2231 	if (unlikely(!sk_fullsock(sk))) {
2232 		skb->destructor = sock_edemux;
2233 		sock_hold(sk);
2234 		return;
2235 	}
2236 #endif
2237 	skb->destructor = sock_wfree;
2238 	skb_set_hash_from_sk(skb, sk);
2239 	/*
2240 	 * We used to take a refcount on sk, but the following operation
2241 	 * is enough to guarantee sk_free() won't free this sock until
2242 	 * all in-flight packets have completed
2243 	 */
2244 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2245 }
2246 EXPORT_SYMBOL(skb_set_owner_w);
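
/* Illustrative sketch (an assumption, not from the original file): a transmit
 * path charges an skb to its socket so that sock_wfree() can release the
 * charge and wake writers once the skb is freed.
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_set_owner_w(skb, sk);	(adds skb->truesize to sk_wmem_alloc)
 *	(build headers/payload, then hand the skb to the device layer)
 */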
2247 
2248 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2249 {
2250 #ifdef CONFIG_TLS_DEVICE
2251 	/* Drivers depend on in-order delivery for crypto offload,
2252 	 * partial orphan breaks out-of-order-OK logic.
2253 	 */
2254 	if (skb->decrypted)
2255 		return false;
2256 #endif
2257 	return (skb->destructor == sock_wfree ||
2258 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2259 }
2260 
2261 /* This helper is used by netem, as it can hold packets in its
2262  * delay queue. We want to allow the owner socket to send more
2263  * packets, as if they were already TX completed by a typical driver.
2264  * But we also want to keep skb->sk set because some packet schedulers
2265  * rely on it (sch_fq for example).
2266  */
2267 void skb_orphan_partial(struct sk_buff *skb)
2268 {
2269 	if (skb_is_tcp_pure_ack(skb))
2270 		return;
2271 
2272 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2273 		return;
2274 
2275 	skb_orphan(skb);
2276 }
2277 EXPORT_SYMBOL(skb_orphan_partial);
2278 
2279 /*
2280  * Read buffer destructor automatically called from kfree_skb.
2281  */
2282 void sock_rfree(struct sk_buff *skb)
2283 {
2284 	struct sock *sk = skb->sk;
2285 	unsigned int len = skb->truesize;
2286 
2287 	atomic_sub(len, &sk->sk_rmem_alloc);
2288 	sk_mem_uncharge(sk, len);
2289 }
2290 EXPORT_SYMBOL(sock_rfree);
2291 
2292 /*
2293  * Buffer destructor for skbs that are not used directly in read or write
2294  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2295  */
2296 void sock_efree(struct sk_buff *skb)
2297 {
2298 	sock_put(skb->sk);
2299 }
2300 EXPORT_SYMBOL(sock_efree);
2301 
2302 /* Buffer destructor for prefetch/receive path where reference count may
2303  * not be held, e.g. for listen sockets.
2304  */
2305 #ifdef CONFIG_INET
2306 void sock_pfree(struct sk_buff *skb)
2307 {
2308 	if (sk_is_refcounted(skb->sk))
2309 		sock_gen_put(skb->sk);
2310 }
2311 EXPORT_SYMBOL(sock_pfree);
2312 #endif /* CONFIG_INET */
2313 
2314 kuid_t sock_i_uid(struct sock *sk)
2315 {
2316 	kuid_t uid;
2317 
2318 	read_lock_bh(&sk->sk_callback_lock);
2319 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2320 	read_unlock_bh(&sk->sk_callback_lock);
2321 	return uid;
2322 }
2323 EXPORT_SYMBOL(sock_i_uid);
2324 
2325 unsigned long sock_i_ino(struct sock *sk)
2326 {
2327 	unsigned long ino;
2328 
2329 	read_lock_bh(&sk->sk_callback_lock);
2330 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2331 	read_unlock_bh(&sk->sk_callback_lock);
2332 	return ino;
2333 }
2334 EXPORT_SYMBOL(sock_i_ino);
2335 
2336 /*
2337  * Allocate a skb from the socket's send buffer.
2338  */
2339 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2340 			     gfp_t priority)
2341 {
2342 	if (force ||
2343 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2344 		struct sk_buff *skb = alloc_skb(size, priority);
2345 
2346 		if (skb) {
2347 			skb_set_owner_w(skb, sk);
2348 			return skb;
2349 		}
2350 	}
2351 	return NULL;
2352 }
2353 EXPORT_SYMBOL(sock_wmalloc);
2354 
2355 static void sock_ofree(struct sk_buff *skb)
2356 {
2357 	struct sock *sk = skb->sk;
2358 
2359 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2360 }
2361 
2362 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2363 			     gfp_t priority)
2364 {
2365 	struct sk_buff *skb;
2366 
2367 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2368 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2369 	    sysctl_optmem_max)
2370 		return NULL;
2371 
2372 	skb = alloc_skb(size, priority);
2373 	if (!skb)
2374 		return NULL;
2375 
2376 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2377 	skb->sk = sk;
2378 	skb->destructor = sock_ofree;
2379 	return skb;
2380 }
2381 
2382 /*
2383  * Allocate a memory block from the socket's option memory buffer.
2384  */
2385 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2386 {
2387 	if ((unsigned int)size <= sysctl_optmem_max &&
2388 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2389 		void *mem;
2390 		/* First do the add, to avoid the race if kmalloc
2391 		 * might sleep.
2392 		 */
2393 		atomic_add(size, &sk->sk_omem_alloc);
2394 		mem = kmalloc(size, priority);
2395 		if (mem)
2396 			return mem;
2397 		atomic_sub(size, &sk->sk_omem_alloc);
2398 	}
2399 	return NULL;
2400 }
2401 EXPORT_SYMBOL(sock_kmalloc);
2402 
2403 /* Free an option memory block. Note, we actually want the inline
2404  * here as this allows gcc to detect the nullify and fold away the
2405  * condition entirely.
2406  */
2407 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2408 				  const bool nullify)
2409 {
2410 	if (WARN_ON_ONCE(!mem))
2411 		return;
2412 	if (nullify)
2413 		kfree_sensitive(mem);
2414 	else
2415 		kfree(mem);
2416 	atomic_sub(size, &sk->sk_omem_alloc);
2417 }
2418 
2419 void sock_kfree_s(struct sock *sk, void *mem, int size)
2420 {
2421 	__sock_kfree_s(sk, mem, size, false);
2422 }
2423 EXPORT_SYMBOL(sock_kfree_s);
2424 
2425 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2426 {
2427 	__sock_kfree_s(sk, mem, size, true);
2428 }
2429 EXPORT_SYMBOL(sock_kzfree_s);
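
/* Illustrative sketch ("struct my_opt" is hypothetical): option memory is
 * allocated with sock_kmalloc() and must be released with sock_kfree_s()
 * (or sock_kzfree_s() for sensitive data) using the same size, so that
 * sk_omem_alloc stays balanced.
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	(use opt ...)
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */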
2430 
2431 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2432    I think these locks should be removed for datagram sockets.
2433  */
2434 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2435 {
2436 	DEFINE_WAIT(wait);
2437 
2438 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2439 	for (;;) {
2440 		if (!timeo)
2441 			break;
2442 		if (signal_pending(current))
2443 			break;
2444 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2445 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2446 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2447 			break;
2448 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2449 			break;
2450 		if (sk->sk_err)
2451 			break;
2452 		timeo = schedule_timeout(timeo);
2453 	}
2454 	finish_wait(sk_sleep(sk), &wait);
2455 	return timeo;
2456 }
2457 
2458 
2459 /*
2460  *	Generic send/receive buffer handlers
2461  */
2462 
2463 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2464 				     unsigned long data_len, int noblock,
2465 				     int *errcode, int max_page_order)
2466 {
2467 	struct sk_buff *skb;
2468 	long timeo;
2469 	int err;
2470 
2471 	timeo = sock_sndtimeo(sk, noblock);
2472 	for (;;) {
2473 		err = sock_error(sk);
2474 		if (err != 0)
2475 			goto failure;
2476 
2477 		err = -EPIPE;
2478 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2479 			goto failure;
2480 
2481 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2482 			break;
2483 
2484 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2485 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2486 		err = -EAGAIN;
2487 		if (!timeo)
2488 			goto failure;
2489 		if (signal_pending(current))
2490 			goto interrupted;
2491 		timeo = sock_wait_for_wmem(sk, timeo);
2492 	}
2493 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2494 				   errcode, sk->sk_allocation);
2495 	if (skb)
2496 		skb_set_owner_w(skb, sk);
2497 	return skb;
2498 
2499 interrupted:
2500 	err = sock_intr_errno(timeo);
2501 failure:
2502 	*errcode = err;
2503 	return NULL;
2504 }
2505 EXPORT_SYMBOL(sock_alloc_send_pskb);
2506 
2507 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2508 				    int noblock, int *errcode)
2509 {
2510 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2511 }
2512 EXPORT_SYMBOL(sock_alloc_send_skb);
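
/* Illustrative sketch (an assumption): a datagram sendmsg() implementation
 * typically reserves header room and lets sock_alloc_send_skb() handle the
 * sndbuf limit, blocking and signal handling.
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 *	skb_reserve(skb, hlen);
 */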
2513 
2514 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2515 		     struct sockcm_cookie *sockc)
2516 {
2517 	u32 tsflags;
2518 
2519 	switch (cmsg->cmsg_type) {
2520 	case SO_MARK:
2521 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2522 			return -EPERM;
2523 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2524 			return -EINVAL;
2525 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2526 		break;
2527 	case SO_TIMESTAMPING_OLD:
2528 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2529 			return -EINVAL;
2530 
2531 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2532 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2533 			return -EINVAL;
2534 
2535 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2536 		sockc->tsflags |= tsflags;
2537 		break;
2538 	case SCM_TXTIME:
2539 		if (!sock_flag(sk, SOCK_TXTIME))
2540 			return -EINVAL;
2541 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2542 			return -EINVAL;
2543 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2544 		break;
2545 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2546 	case SCM_RIGHTS:
2547 	case SCM_CREDENTIALS:
2548 		break;
2549 	default:
2550 		return -EINVAL;
2551 	}
2552 	return 0;
2553 }
2554 EXPORT_SYMBOL(__sock_cmsg_send);
2555 
2556 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2557 		   struct sockcm_cookie *sockc)
2558 {
2559 	struct cmsghdr *cmsg;
2560 	int ret;
2561 
2562 	for_each_cmsghdr(cmsg, msg) {
2563 		if (!CMSG_OK(msg, cmsg))
2564 			return -EINVAL;
2565 		if (cmsg->cmsg_level != SOL_SOCKET)
2566 			continue;
2567 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2568 		if (ret)
2569 			return ret;
2570 	}
2571 	return 0;
2572 }
2573 EXPORT_SYMBOL(sock_cmsg_send);
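
/* Illustrative sketch (an assumption): callers usually seed a sockcm_cookie
 * from socket defaults (sockcm_init(), declared in the sock header) and then
 * let sock_cmsg_send() override it from SOL_SOCKET control messages.
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */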
2574 
2575 static void sk_enter_memory_pressure(struct sock *sk)
2576 {
2577 	if (!sk->sk_prot->enter_memory_pressure)
2578 		return;
2579 
2580 	sk->sk_prot->enter_memory_pressure(sk);
2581 }
2582 
2583 static void sk_leave_memory_pressure(struct sock *sk)
2584 {
2585 	if (sk->sk_prot->leave_memory_pressure) {
2586 		sk->sk_prot->leave_memory_pressure(sk);
2587 	} else {
2588 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2589 
2590 		if (memory_pressure && READ_ONCE(*memory_pressure))
2591 			WRITE_ONCE(*memory_pressure, 0);
2592 	}
2593 }
2594 
2595 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2596 
2597 /**
2598  * skb_page_frag_refill - check that a page_frag contains enough room
2599  * @sz: minimum size of the fragment we want to get
2600  * @pfrag: pointer to page_frag
2601  * @gfp: priority for memory allocation
2602  *
2603  * Note: While this allocator tries to use high order pages, there is
2604  * no guarantee that allocations succeed. Therefore, @sz MUST be
2605  * less than or equal to PAGE_SIZE.
2606  */
2607 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2608 {
2609 	if (pfrag->page) {
2610 		if (page_ref_count(pfrag->page) == 1) {
2611 			pfrag->offset = 0;
2612 			return true;
2613 		}
2614 		if (pfrag->offset + sz <= pfrag->size)
2615 			return true;
2616 		put_page(pfrag->page);
2617 	}
2618 
2619 	pfrag->offset = 0;
2620 	if (SKB_FRAG_PAGE_ORDER &&
2621 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2622 		/* Avoid direct reclaim but allow kswapd to wake */
2623 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2624 					  __GFP_COMP | __GFP_NOWARN |
2625 					  __GFP_NORETRY,
2626 					  SKB_FRAG_PAGE_ORDER);
2627 		if (likely(pfrag->page)) {
2628 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2629 			return true;
2630 		}
2631 	}
2632 	pfrag->page = alloc_page(gfp);
2633 	if (likely(pfrag->page)) {
2634 		pfrag->size = PAGE_SIZE;
2635 		return true;
2636 	}
2637 	return false;
2638 }
2639 EXPORT_SYMBOL(skb_page_frag_refill);
2640 
2641 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2642 {
2643 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2644 		return true;
2645 
2646 	sk_enter_memory_pressure(sk);
2647 	sk_stream_moderate_sndbuf(sk);
2648 	return false;
2649 }
2650 EXPORT_SYMBOL(sk_page_frag_refill);
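
/* Illustrative sketch (an assumption): senders that build payload from the
 * per-socket page fragment refill it first and then append at pfrag->offset.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	(copy "copy" bytes into pfrag->page at pfrag->offset, then)
 *	pfrag->offset += copy;
 */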
2651 
2652 void __lock_sock(struct sock *sk)
2653 	__releases(&sk->sk_lock.slock)
2654 	__acquires(&sk->sk_lock.slock)
2655 {
2656 	DEFINE_WAIT(wait);
2657 
2658 	for (;;) {
2659 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2660 					TASK_UNINTERRUPTIBLE);
2661 		spin_unlock_bh(&sk->sk_lock.slock);
2662 		schedule();
2663 		spin_lock_bh(&sk->sk_lock.slock);
2664 		if (!sock_owned_by_user(sk))
2665 			break;
2666 	}
2667 	finish_wait(&sk->sk_lock.wq, &wait);
2668 }
2669 
2670 void __release_sock(struct sock *sk)
2671 	__releases(&sk->sk_lock.slock)
2672 	__acquires(&sk->sk_lock.slock)
2673 {
2674 	struct sk_buff *skb, *next;
2675 
2676 	while ((skb = sk->sk_backlog.head) != NULL) {
2677 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2678 
2679 		spin_unlock_bh(&sk->sk_lock.slock);
2680 
2681 		do {
2682 			next = skb->next;
2683 			prefetch(next);
2684 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2685 			skb_mark_not_on_list(skb);
2686 			sk_backlog_rcv(sk, skb);
2687 
2688 			cond_resched();
2689 
2690 			skb = next;
2691 		} while (skb != NULL);
2692 
2693 		spin_lock_bh(&sk->sk_lock.slock);
2694 	}
2695 
2696 	/*
2697 	 * Doing the zeroing here guarantees we cannot loop forever
2698 	 * while a wild producer attempts to flood us.
2699 	 */
2700 	sk->sk_backlog.len = 0;
2701 }
2702 
2703 void __sk_flush_backlog(struct sock *sk)
2704 {
2705 	spin_lock_bh(&sk->sk_lock.slock);
2706 	__release_sock(sk);
2707 	spin_unlock_bh(&sk->sk_lock.slock);
2708 }
2709 
2710 /**
2711  * sk_wait_data - wait for data to arrive at sk_receive_queue
2712  * @sk:    sock to wait on
2713  * @timeo: for how long
2714  * @skb:   last skb seen on sk_receive_queue
2715  *
2716  * Now socket state, including sk->sk_err, is changed only under the lock,
2717  * hence we may omit checks after joining the wait queue.
2718  * We check the receive queue before schedule() only as an optimization;
2719  * it is very likely that release_sock() added new data.
2720  */
2721 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2722 {
2723 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2724 	int rc;
2725 
2726 	add_wait_queue(sk_sleep(sk), &wait);
2727 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2728 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2729 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2730 	remove_wait_queue(sk_sleep(sk), &wait);
2731 	return rc;
2732 }
2733 EXPORT_SYMBOL(sk_wait_data);
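
/* Illustrative sketch (an assumption): a blocking recvmsg() loop, running
 * under lock_sock(), typically alternates between checking sk_receive_queue
 * and sleeping in sk_wait_data() until data arrives, the timeout expires or
 * a signal is pending.
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */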
2734 
2735 /**
2736  *	__sk_mem_raise_allocated - increase memory_allocated
2737  *	@sk: socket
2738  *	@size: memory size to allocate
2739  *	@amt: pages to allocate
2740  *	@kind: allocation type
2741  *
2742  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2743  */
2744 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2745 {
2746 	struct proto *prot = sk->sk_prot;
2747 	long allocated = sk_memory_allocated_add(sk, amt);
2748 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2749 	bool charged = true;
2750 
2751 	if (memcg_charge &&
2752 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2753 						gfp_memcg_charge())))
2754 		goto suppress_allocation;
2755 
2756 	/* Under limit. */
2757 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2758 		sk_leave_memory_pressure(sk);
2759 		return 1;
2760 	}
2761 
2762 	/* Under pressure. */
2763 	if (allocated > sk_prot_mem_limits(sk, 1))
2764 		sk_enter_memory_pressure(sk);
2765 
2766 	/* Over hard limit. */
2767 	if (allocated > sk_prot_mem_limits(sk, 2))
2768 		goto suppress_allocation;
2769 
2770 	/* guarantee minimum buffer size under pressure */
2771 	if (kind == SK_MEM_RECV) {
2772 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2773 			return 1;
2774 
2775 	} else { /* SK_MEM_SEND */
2776 		int wmem0 = sk_get_wmem0(sk, prot);
2777 
2778 		if (sk->sk_type == SOCK_STREAM) {
2779 			if (sk->sk_wmem_queued < wmem0)
2780 				return 1;
2781 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2782 				return 1;
2783 		}
2784 	}
2785 
2786 	if (sk_has_memory_pressure(sk)) {
2787 		u64 alloc;
2788 
2789 		if (!sk_under_memory_pressure(sk))
2790 			return 1;
2791 		alloc = sk_sockets_allocated_read_positive(sk);
2792 		if (sk_prot_mem_limits(sk, 2) > alloc *
2793 		    sk_mem_pages(sk->sk_wmem_queued +
2794 				 atomic_read(&sk->sk_rmem_alloc) +
2795 				 sk->sk_forward_alloc))
2796 			return 1;
2797 	}
2798 
2799 suppress_allocation:
2800 
2801 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2802 		sk_stream_moderate_sndbuf(sk);
2803 
2804 		/* Fail only if socket is _under_ its sndbuf.
2805 		 * In this case we cannot block, so we have to fail.
2806 		 */
2807 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2808 			/* Force charge with __GFP_NOFAIL */
2809 			if (memcg_charge && !charged) {
2810 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2811 					gfp_memcg_charge() | __GFP_NOFAIL);
2812 			}
2813 			return 1;
2814 		}
2815 	}
2816 
2817 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2818 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2819 
2820 	sk_memory_allocated_sub(sk, amt);
2821 
2822 	if (memcg_charge && charged)
2823 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2824 
2825 	return 0;
2826 }
2827 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2828 
2829 /**
2830  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2831  *	@sk: socket
2832  *	@size: memory size to allocate
2833  *	@kind: allocation type
2834  *
2835  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2836  *	rmem allocation. This function assumes that protocols which have
2837  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2838  */
2839 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2840 {
2841 	int ret, amt = sk_mem_pages(size);
2842 
2843 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2844 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2845 	if (!ret)
2846 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2847 	return ret;
2848 }
2849 EXPORT_SYMBOL(__sk_mem_schedule);
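
/* Worked example (assuming SK_MEM_QUANTUM == PAGE_SIZE == 4096): scheduling
 * size = 3000 bytes gives amt = sk_mem_pages(3000) = 1, so sk_forward_alloc
 * grows by 4096 bytes. If __sk_mem_raise_allocated() refuses the page, the
 * 4096 bytes are taken back and the caller sees a return value of 0.
 */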
2850 
2851 /**
2852  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2853  *	@sk: socket
2854  *	@amount: number of quanta
2855  *
2856  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2857  */
2858 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2859 {
2860 	sk_memory_allocated_sub(sk, amount);
2861 
2862 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2863 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2864 
2865 	if (sk_under_memory_pressure(sk) &&
2866 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2867 		sk_leave_memory_pressure(sk);
2868 }
2869 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2870 
2871 /**
2872  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2873  *	@sk: socket
2874  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2875  */
2876 void __sk_mem_reclaim(struct sock *sk, int amount)
2877 {
2878 	amount >>= SK_MEM_QUANTUM_SHIFT;
2879 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2880 	__sk_mem_reduce_allocated(sk, amount);
2881 }
2882 EXPORT_SYMBOL(__sk_mem_reclaim);
2883 
2884 int sk_set_peek_off(struct sock *sk, int val)
2885 {
2886 	sk->sk_peek_off = val;
2887 	return 0;
2888 }
2889 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2890 
2891 /*
2892  * Set of default routines for initialising struct proto_ops when
2893  * the protocol does not support a particular function. In certain
2894  * cases where it makes no sense for a protocol to have a "do nothing"
2895  * function, some default processing is provided.
2896  */
2897 
2898 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2899 {
2900 	return -EOPNOTSUPP;
2901 }
2902 EXPORT_SYMBOL(sock_no_bind);
2903 
2904 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2905 		    int len, int flags)
2906 {
2907 	return -EOPNOTSUPP;
2908 }
2909 EXPORT_SYMBOL(sock_no_connect);
2910 
2911 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2912 {
2913 	return -EOPNOTSUPP;
2914 }
2915 EXPORT_SYMBOL(sock_no_socketpair);
2916 
2917 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2918 		   bool kern)
2919 {
2920 	return -EOPNOTSUPP;
2921 }
2922 EXPORT_SYMBOL(sock_no_accept);
2923 
2924 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2925 		    int peer)
2926 {
2927 	return -EOPNOTSUPP;
2928 }
2929 EXPORT_SYMBOL(sock_no_getname);
2930 
2931 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2932 {
2933 	return -EOPNOTSUPP;
2934 }
2935 EXPORT_SYMBOL(sock_no_ioctl);
2936 
2937 int sock_no_listen(struct socket *sock, int backlog)
2938 {
2939 	return -EOPNOTSUPP;
2940 }
2941 EXPORT_SYMBOL(sock_no_listen);
2942 
2943 int sock_no_shutdown(struct socket *sock, int how)
2944 {
2945 	return -EOPNOTSUPP;
2946 }
2947 EXPORT_SYMBOL(sock_no_shutdown);
2948 
2949 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2950 {
2951 	return -EOPNOTSUPP;
2952 }
2953 EXPORT_SYMBOL(sock_no_sendmsg);
2954 
2955 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2956 {
2957 	return -EOPNOTSUPP;
2958 }
2959 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2960 
2961 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2962 		    int flags)
2963 {
2964 	return -EOPNOTSUPP;
2965 }
2966 EXPORT_SYMBOL(sock_no_recvmsg);
2967 
2968 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2969 {
2970 	/* Mirror missing mmap method error code */
2971 	return -ENODEV;
2972 }
2973 EXPORT_SYMBOL(sock_no_mmap);
2974 
2975 /*
2976  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2977  * various sock-based usage counts.
2978  */
2979 void __receive_sock(struct file *file)
2980 {
2981 	struct socket *sock;
2982 
2983 	sock = sock_from_file(file);
2984 	if (sock) {
2985 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2986 		sock_update_classid(&sock->sk->sk_cgrp_data);
2987 	}
2988 }
2989 
2990 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2991 {
2992 	ssize_t res;
2993 	struct msghdr msg = {.msg_flags = flags};
2994 	struct kvec iov;
2995 	char *kaddr = kmap(page);
2996 	iov.iov_base = kaddr + offset;
2997 	iov.iov_len = size;
2998 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2999 	kunmap(page);
3000 	return res;
3001 }
3002 EXPORT_SYMBOL(sock_no_sendpage);
3003 
3004 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3005 				int offset, size_t size, int flags)
3006 {
3007 	ssize_t res;
3008 	struct msghdr msg = {.msg_flags = flags};
3009 	struct kvec iov;
3010 	char *kaddr = kmap(page);
3011 
3012 	iov.iov_base = kaddr + offset;
3013 	iov.iov_len = size;
3014 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3015 	kunmap(page);
3016 	return res;
3017 }
3018 EXPORT_SYMBOL(sock_no_sendpage_locked);
3019 
3020 /*
3021  *	Default Socket Callbacks
3022  */
3023 
3024 static void sock_def_wakeup(struct sock *sk)
3025 {
3026 	struct socket_wq *wq;
3027 
3028 	rcu_read_lock();
3029 	wq = rcu_dereference(sk->sk_wq);
3030 	if (skwq_has_sleeper(wq))
3031 		wake_up_interruptible_all(&wq->wait);
3032 	rcu_read_unlock();
3033 }
3034 
3035 static void sock_def_error_report(struct sock *sk)
3036 {
3037 	struct socket_wq *wq;
3038 
3039 	rcu_read_lock();
3040 	wq = rcu_dereference(sk->sk_wq);
3041 	if (skwq_has_sleeper(wq))
3042 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3043 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3044 	rcu_read_unlock();
3045 }
3046 
3047 void sock_def_readable(struct sock *sk)
3048 {
3049 	struct socket_wq *wq;
3050 
3051 	rcu_read_lock();
3052 	wq = rcu_dereference(sk->sk_wq);
3053 	if (skwq_has_sleeper(wq))
3054 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3055 						EPOLLRDNORM | EPOLLRDBAND);
3056 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3057 	rcu_read_unlock();
3058 }
3059 
3060 static void sock_def_write_space(struct sock *sk)
3061 {
3062 	struct socket_wq *wq;
3063 
3064 	rcu_read_lock();
3065 
3066 	/* Do not wake up a writer until he can make "significant"
3067 	 * progress.  --DaveM
3068 	 */
3069 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3070 		wq = rcu_dereference(sk->sk_wq);
3071 		if (skwq_has_sleeper(wq))
3072 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3073 						EPOLLWRNORM | EPOLLWRBAND);
3074 
3075 		/* Should agree with poll, otherwise some programs break */
3076 		if (sock_writeable(sk))
3077 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3078 	}
3079 
3080 	rcu_read_unlock();
3081 }
3082 
3083 static void sock_def_destruct(struct sock *sk)
3084 {
3085 }
3086 
3087 void sk_send_sigurg(struct sock *sk)
3088 {
3089 	if (sk->sk_socket && sk->sk_socket->file)
3090 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3091 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3092 }
3093 EXPORT_SYMBOL(sk_send_sigurg);
3094 
3095 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
3096 		    unsigned long expires)
3097 {
3098 	if (!mod_timer(timer, expires))
3099 		sock_hold(sk);
3100 }
3101 EXPORT_SYMBOL(sk_reset_timer);
3102 
3103 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
3104 {
3105 	if (del_timer(timer))
3106 		__sock_put(sk);
3107 }
3108 EXPORT_SYMBOL(sk_stop_timer);
3109 
3110 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3111 {
3112 	if (del_timer_sync(timer))
3113 		__sock_put(sk);
3114 }
3115 EXPORT_SYMBOL(sk_stop_timer_sync);
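
/* Illustrative sketch (an assumption): sk_reset_timer()/sk_stop_timer() keep
 * the sock refcount balanced with a pending timer, so a protocol arming a
 * retransmit timer would pair them like this ("my_timer" and "my_timeout"
 * are hypothetical):
 *
 *	sk_reset_timer(sk, &my_timer, jiffies + my_timeout);
 *	...
 *	sk_stop_timer(sk, &my_timer);	(drops the ref if the timer was pending)
 */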
3116 
3117 void sock_init_data(struct socket *sock, struct sock *sk)
3118 {
3119 	sk_init_common(sk);
3120 	sk->sk_send_head	=	NULL;
3121 
3122 	timer_setup(&sk->sk_timer, NULL, 0);
3123 
3124 	sk->sk_allocation	=	GFP_KERNEL;
3125 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3126 	sk->sk_sndbuf		=	sysctl_wmem_default;
3127 	sk->sk_state		=	TCP_CLOSE;
3128 	sk_set_socket(sk, sock);
3129 
3130 	sock_set_flag(sk, SOCK_ZAPPED);
3131 
3132 	if (sock) {
3133 		sk->sk_type	=	sock->type;
3134 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3135 		sock->sk	=	sk;
3136 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3137 	} else {
3138 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3139 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3140 	}
3141 
3142 	rwlock_init(&sk->sk_callback_lock);
3143 	if (sk->sk_kern_sock)
3144 		lockdep_set_class_and_name(
3145 			&sk->sk_callback_lock,
3146 			af_kern_callback_keys + sk->sk_family,
3147 			af_family_kern_clock_key_strings[sk->sk_family]);
3148 	else
3149 		lockdep_set_class_and_name(
3150 			&sk->sk_callback_lock,
3151 			af_callback_keys + sk->sk_family,
3152 			af_family_clock_key_strings[sk->sk_family]);
3153 
3154 	sk->sk_state_change	=	sock_def_wakeup;
3155 	sk->sk_data_ready	=	sock_def_readable;
3156 	sk->sk_write_space	=	sock_def_write_space;
3157 	sk->sk_error_report	=	sock_def_error_report;
3158 	sk->sk_destruct		=	sock_def_destruct;
3159 
3160 	sk->sk_frag.page	=	NULL;
3161 	sk->sk_frag.offset	=	0;
3162 	sk->sk_peek_off		=	-1;
3163 
3164 	sk->sk_peer_pid 	=	NULL;
3165 	sk->sk_peer_cred	=	NULL;
3166 	spin_lock_init(&sk->sk_peer_lock);
3167 
3168 	sk->sk_write_pending	=	0;
3169 	sk->sk_rcvlowat		=	1;
3170 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3171 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3172 
3173 	sk->sk_stamp = SK_DEFAULT_STAMP;
3174 #if BITS_PER_LONG==32
3175 	seqlock_init(&sk->sk_stamp_seq);
3176 #endif
3177 	atomic_set(&sk->sk_zckey, 0);
3178 
3179 #ifdef CONFIG_NET_RX_BUSY_POLL
3180 	sk->sk_napi_id		=	0;
3181 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3182 #endif
3183 
3184 	sk->sk_max_pacing_rate = ~0UL;
3185 	sk->sk_pacing_rate = ~0UL;
3186 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3187 	sk->sk_incoming_cpu = -1;
3188 
3189 	sk_rx_queue_clear(sk);
3190 	/*
3191 	 * Before updating sk_refcnt, we must commit prior changes to memory
3192 	 * (Documentation/RCU/rculist_nulls.rst for details)
3193 	 */
3194 	smp_wmb();
3195 	refcount_set(&sk->sk_refcnt, 1);
3196 	atomic_set(&sk->sk_drops, 0);
3197 }
3198 EXPORT_SYMBOL(sock_init_data);
3199 
3200 void lock_sock_nested(struct sock *sk, int subclass)
3201 {
3202 	/* The sk_lock has mutex_lock() semantics here. */
3203 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3204 
3205 	might_sleep();
3206 	spin_lock_bh(&sk->sk_lock.slock);
3207 	if (sk->sk_lock.owned)
3208 		__lock_sock(sk);
3209 	sk->sk_lock.owned = 1;
3210 	spin_unlock_bh(&sk->sk_lock.slock);
3211 }
3212 EXPORT_SYMBOL(lock_sock_nested);
3213 
3214 void release_sock(struct sock *sk)
3215 {
3216 	spin_lock_bh(&sk->sk_lock.slock);
3217 	if (sk->sk_backlog.tail)
3218 		__release_sock(sk);
3219 
3220 	/* Warning: release_cb() might need to release sk ownership,
3221 	 * i.e. call sock_release_ownership(sk) before us.
3222 	 */
3223 	if (sk->sk_prot->release_cb)
3224 		sk->sk_prot->release_cb(sk);
3225 
3226 	sock_release_ownership(sk);
3227 	if (waitqueue_active(&sk->sk_lock.wq))
3228 		wake_up(&sk->sk_lock.wq);
3229 	spin_unlock_bh(&sk->sk_lock.slock);
3230 }
3231 EXPORT_SYMBOL(release_sock);
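
/* Illustrative sketch (an assumption): process-context callers bracket
 * socket state changes with lock_sock()/release_sock(); release_sock() also
 * flushes any backlog that softirq queued while the lock was owned.
 *
 *	lock_sock(sk);
 *	(modify sk state, possibly sleep)
 *	release_sock(sk);
 */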
3232 
3233 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3234 {
3235 	might_sleep();
3236 	spin_lock_bh(&sk->sk_lock.slock);
3237 
3238 	if (!sk->sk_lock.owned) {
3239 		/*
3240 		 * Fast path return with bottom halves disabled and
3241 		 * sock::sk_lock.slock held.
3242 		 *
3243 		 * The 'mutex' is not contended and holding
3244 		 * sock::sk_lock.slock prevents all other lockers to
3245 		 * proceed so the corresponding unlock_sock_fast() can
3246 		 * avoid the slow path of release_sock() completely and
3247 		 * just release slock.
3248 		 *
3249 		 * From a semantic POV this is equivalent to 'acquiring'
3250 		 * the 'mutex', hence the corresponding lockdep
3251 		 * mutex_release() has to happen in the fast path of
3252 		 * unlock_sock_fast().
3253 		 */
3254 		return false;
3255 	}
3256 
3257 	__lock_sock(sk);
3258 	sk->sk_lock.owned = 1;
3259 	__acquire(&sk->sk_lock.slock);
3260 	spin_unlock_bh(&sk->sk_lock.slock);
3261 	return true;
3262 }
3263 EXPORT_SYMBOL(__lock_sock_fast);
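
/* Illustrative sketch (an assumption): the fast-lock wrappers
 * lock_sock_fast()/unlock_sock_fast() from the sock header are used for
 * short, non-sleeping sections; unlock_sock_fast() must be passed the value
 * returned by lock_sock_fast().
 *
 *	bool slow = lock_sock_fast(sk);
 *	(short critical section)
 *	unlock_sock_fast(sk, slow);
 */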
3264 
3265 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3266 		   bool timeval, bool time32)
3267 {
3268 	struct sock *sk = sock->sk;
3269 	struct timespec64 ts;
3270 
3271 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3272 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3273 	if (ts.tv_sec == -1)
3274 		return -ENOENT;
3275 	if (ts.tv_sec == 0) {
3276 		ktime_t kt = ktime_get_real();
3277 		sock_write_timestamp(sk, kt);
3278 		ts = ktime_to_timespec64(kt);
3279 	}
3280 
3281 	if (timeval)
3282 		ts.tv_nsec /= 1000;
3283 
3284 #ifdef CONFIG_COMPAT_32BIT_TIME
3285 	if (time32)
3286 		return put_old_timespec32(&ts, userstamp);
3287 #endif
3288 #ifdef CONFIG_SPARC64
3289 	/* beware of padding in sparc64 timeval */
3290 	if (timeval && !in_compat_syscall()) {
3291 		struct __kernel_old_timeval __user tv = {
3292 			.tv_sec = ts.tv_sec,
3293 			.tv_usec = ts.tv_nsec,
3294 		};
3295 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3296 			return -EFAULT;
3297 		return 0;
3298 	}
3299 #endif
3300 	return put_timespec64(&ts, userstamp);
3301 }
3302 EXPORT_SYMBOL(sock_gettstamp);
3303 
3304 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3305 {
3306 	if (!sock_flag(sk, flag)) {
3307 		unsigned long previous_flags = sk->sk_flags;
3308 
3309 		sock_set_flag(sk, flag);
3310 		/*
3311 		 * we just set one of the two flags which require net
3312 		 * time stamping, but time stamping might have been on
3313 		 * already because of the other one
3314 		 */
3315 		if (sock_needs_netstamp(sk) &&
3316 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3317 			net_enable_timestamp();
3318 	}
3319 }
3320 
3321 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3322 		       int level, int type)
3323 {
3324 	struct sock_exterr_skb *serr;
3325 	struct sk_buff *skb;
3326 	int copied, err;
3327 
3328 	err = -EAGAIN;
3329 	skb = sock_dequeue_err_skb(sk);
3330 	if (skb == NULL)
3331 		goto out;
3332 
3333 	copied = skb->len;
3334 	if (copied > len) {
3335 		msg->msg_flags |= MSG_TRUNC;
3336 		copied = len;
3337 	}
3338 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3339 	if (err)
3340 		goto out_free_skb;
3341 
3342 	sock_recv_timestamp(msg, sk, skb);
3343 
3344 	serr = SKB_EXT_ERR(skb);
3345 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3346 
3347 	msg->msg_flags |= MSG_ERRQUEUE;
3348 	err = copied;
3349 
3350 out_free_skb:
3351 	kfree_skb(skb);
3352 out:
3353 	return err;
3354 }
3355 EXPORT_SYMBOL(sock_recv_errqueue);
3356 
3357 /*
3358  *	Get a socket option on a socket.
3359  *
3360  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3361  *	asynchronous errors should be reported by getsockopt. We assume
3362  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3363  */
3364 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3365 			   char __user *optval, int __user *optlen)
3366 {
3367 	struct sock *sk = sock->sk;
3368 
3369 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3370 }
3371 EXPORT_SYMBOL(sock_common_getsockopt);
3372 
3373 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3374 			int flags)
3375 {
3376 	struct sock *sk = sock->sk;
3377 	int addr_len = 0;
3378 	int err;
3379 
3380 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3381 				   flags & ~MSG_DONTWAIT, &addr_len);
3382 	if (err >= 0)
3383 		msg->msg_namelen = addr_len;
3384 	return err;
3385 }
3386 EXPORT_SYMBOL(sock_common_recvmsg);
3387 
3388 /*
3389  *	Set socket options on an inet socket.
3390  */
3391 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3392 			   sockptr_t optval, unsigned int optlen)
3393 {
3394 	struct sock *sk = sock->sk;
3395 
3396 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3397 }
3398 EXPORT_SYMBOL(sock_common_setsockopt);
3399 
3400 void sk_common_release(struct sock *sk)
3401 {
3402 	if (sk->sk_prot->destroy)
3403 		sk->sk_prot->destroy(sk);
3404 
3405 	/*
3406 	 * Observation: when sk_common_release is called, processes have
3407 	 * no access to the socket, but the network stack still does.
3408 	 * Step one, detach it from networking:
3409 	 *
3410 	 * A. Remove from hash tables.
3411 	 */
3412 
3413 	sk->sk_prot->unhash(sk);
3414 
3415 	/*
3416 	 * At this point the socket cannot receive new packets, but it is
3417 	 * possible that some packets are in flight because some CPU runs the
3418 	 * receiver and did a hash table lookup before we unhashed the socket.
3419 	 * They will reach the receive queue and be purged by the socket destructor.
3420 	 *
3421 	 * Also, we still have packets pending on the receive queue and probably
3422 	 * our own packets waiting in device queues. sock_destroy will drain the
3423 	 * receive queue, but transmitted packets will delay socket destruction
3424 	 * until the last reference is released.
3425 	 */
3426 
3427 	sock_orphan(sk);
3428 
3429 	xfrm_sk_free_policy(sk);
3430 
3431 	sk_refcnt_debug_release(sk);
3432 
3433 	sock_put(sk);
3434 }
3435 EXPORT_SYMBOL(sk_common_release);
3436 
3437 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3438 {
3439 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3440 
3441 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3442 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3443 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3444 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3445 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3446 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3447 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3448 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3449 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3450 }
3451 
3452 #ifdef CONFIG_PROC_FS
3453 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3454 struct prot_inuse {
3455 	int val[PROTO_INUSE_NR];
3456 };
3457 
3458 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3459 
3460 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3461 {
3462 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3463 }
3464 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3465 
3466 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3467 {
3468 	int cpu, idx = prot->inuse_idx;
3469 	int res = 0;
3470 
3471 	for_each_possible_cpu(cpu)
3472 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3473 
3474 	return res >= 0 ? res : 0;
3475 }
3476 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3477 
3478 static void sock_inuse_add(struct net *net, int val)
3479 {
3480 	this_cpu_add(*net->core.sock_inuse, val);
3481 }
3482 
3483 int sock_inuse_get(struct net *net)
3484 {
3485 	int cpu, res = 0;
3486 
3487 	for_each_possible_cpu(cpu)
3488 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3489 
3490 	return res;
3491 }
3492 
3493 EXPORT_SYMBOL_GPL(sock_inuse_get);
3494 
3495 static int __net_init sock_inuse_init_net(struct net *net)
3496 {
3497 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3498 	if (net->core.prot_inuse == NULL)
3499 		return -ENOMEM;
3500 
3501 	net->core.sock_inuse = alloc_percpu(int);
3502 	if (net->core.sock_inuse == NULL)
3503 		goto out;
3504 
3505 	return 0;
3506 
3507 out:
3508 	free_percpu(net->core.prot_inuse);
3509 	return -ENOMEM;
3510 }
3511 
3512 static void __net_exit sock_inuse_exit_net(struct net *net)
3513 {
3514 	free_percpu(net->core.prot_inuse);
3515 	free_percpu(net->core.sock_inuse);
3516 }
3517 
3518 static struct pernet_operations net_inuse_ops = {
3519 	.init = sock_inuse_init_net,
3520 	.exit = sock_inuse_exit_net,
3521 };
3522 
3523 static __init int net_inuse_init(void)
3524 {
3525 	if (register_pernet_subsys(&net_inuse_ops))
3526 		panic("Cannot initialize net inuse counters");
3527 
3528 	return 0;
3529 }
3530 
3531 core_initcall(net_inuse_init);
3532 
3533 static int assign_proto_idx(struct proto *prot)
3534 {
3535 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3536 
3537 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3538 		pr_err("PROTO_INUSE_NR exhausted\n");
3539 		return -ENOSPC;
3540 	}
3541 
3542 	set_bit(prot->inuse_idx, proto_inuse_idx);
3543 	return 0;
3544 }
3545 
3546 static void release_proto_idx(struct proto *prot)
3547 {
3548 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3549 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3550 }
3551 #else
3552 static inline int assign_proto_idx(struct proto *prot)
3553 {
3554 	return 0;
3555 }
3556 
3557 static inline void release_proto_idx(struct proto *prot)
3558 {
3559 }
3560 
3561 static void sock_inuse_add(struct net *net, int val)
3562 {
3563 }
3564 #endif
3565 
3566 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3567 {
3568 	if (!twsk_prot)
3569 		return;
3570 	kfree(twsk_prot->twsk_slab_name);
3571 	twsk_prot->twsk_slab_name = NULL;
3572 	kmem_cache_destroy(twsk_prot->twsk_slab);
3573 	twsk_prot->twsk_slab = NULL;
3574 }
3575 
3576 static int tw_prot_init(const struct proto *prot)
3577 {
3578 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3579 
3580 	if (!twsk_prot)
3581 		return 0;
3582 
3583 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3584 					      prot->name);
3585 	if (!twsk_prot->twsk_slab_name)
3586 		return -ENOMEM;
3587 
3588 	twsk_prot->twsk_slab =
3589 		kmem_cache_create(twsk_prot->twsk_slab_name,
3590 				  twsk_prot->twsk_obj_size, 0,
3591 				  SLAB_ACCOUNT | prot->slab_flags,
3592 				  NULL);
3593 	if (!twsk_prot->twsk_slab) {
3594 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3595 			prot->name);
3596 		return -ENOMEM;
3597 	}
3598 
3599 	return 0;
3600 }
3601 
3602 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3603 {
3604 	if (!rsk_prot)
3605 		return;
3606 	kfree(rsk_prot->slab_name);
3607 	rsk_prot->slab_name = NULL;
3608 	kmem_cache_destroy(rsk_prot->slab);
3609 	rsk_prot->slab = NULL;
3610 }
3611 
3612 static int req_prot_init(const struct proto *prot)
3613 {
3614 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3615 
3616 	if (!rsk_prot)
3617 		return 0;
3618 
3619 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3620 					prot->name);
3621 	if (!rsk_prot->slab_name)
3622 		return -ENOMEM;
3623 
3624 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3625 					   rsk_prot->obj_size, 0,
3626 					   SLAB_ACCOUNT | prot->slab_flags,
3627 					   NULL);
3628 
3629 	if (!rsk_prot->slab) {
3630 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3631 			prot->name);
3632 		return -ENOMEM;
3633 	}
3634 	return 0;
3635 }
3636 
3637 int proto_register(struct proto *prot, int alloc_slab)
3638 {
3639 	int ret = -ENOBUFS;
3640 
3641 	if (alloc_slab) {
3642 		prot->slab = kmem_cache_create_usercopy(prot->name,
3643 					prot->obj_size, 0,
3644 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3645 					prot->slab_flags,
3646 					prot->useroffset, prot->usersize,
3647 					NULL);
3648 
3649 		if (prot->slab == NULL) {
3650 			pr_crit("%s: Can't create sock SLAB cache!\n",
3651 				prot->name);
3652 			goto out;
3653 		}
3654 
3655 		if (req_prot_init(prot))
3656 			goto out_free_request_sock_slab;
3657 
3658 		if (tw_prot_init(prot))
3659 			goto out_free_timewait_sock_slab;
3660 	}
3661 
3662 	mutex_lock(&proto_list_mutex);
3663 	ret = assign_proto_idx(prot);
3664 	if (ret) {
3665 		mutex_unlock(&proto_list_mutex);
3666 		goto out_free_timewait_sock_slab;
3667 	}
3668 	list_add(&prot->node, &proto_list);
3669 	mutex_unlock(&proto_list_mutex);
3670 	return ret;
3671 
3672 out_free_timewait_sock_slab:
3673 	if (alloc_slab)
3674 		tw_prot_cleanup(prot->twsk_prot);
3675 out_free_request_sock_slab:
3676 	if (alloc_slab) {
3677 		req_prot_cleanup(prot->rsk_prot);
3678 
3679 		kmem_cache_destroy(prot->slab);
3680 		prot->slab = NULL;
3681 	}
3682 out:
3683 	return ret;
3684 }
3685 EXPORT_SYMBOL(proto_register);
3686 
proto_unregister(struct proto * prot)3687 void proto_unregister(struct proto *prot)
3688 {
3689 	mutex_lock(&proto_list_mutex);
3690 	release_proto_idx(prot);
3691 	list_del(&prot->node);
3692 	mutex_unlock(&proto_list_mutex);
3693 
3694 	kmem_cache_destroy(prot->slab);
3695 	prot->slab = NULL;
3696 
3697 	req_prot_cleanup(prot->rsk_prot);
3698 	tw_prot_cleanup(prot->twsk_prot);
3699 }
3700 EXPORT_SYMBOL(proto_unregister);
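
/*
 * Usage sketch (illustrative only, not part of this file): a protocol
 * module with a hypothetical struct proto would typically pair the two
 * calls above in its init/exit paths:
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);
 *	}
 *	module_init(foo_init);
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_prot);
 *	}
 *	module_exit(foo_exit);
 *
 * "foo_prot" and "struct foo_sock" are made-up names; real protocols fill
 * in many more struct proto callbacks before registering.
 */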
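/*
 * Ask for the NETLINK_SOCK_DIAG handler of a given family/protocol to be
 * loaded.  Diag modules advertise themselves with module aliases of the
 * form "net-pf-<PF_NETLINK>-proto-<NETLINK_SOCK_DIAG>-type-<family>[-<protocol>]"
 * (see MODULE_ALIAS_NET_PF_PROTO_TYPE()), which the request_module() calls
 * below reproduce.
 */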
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
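/*
 * The remainder of this #ifdef block implements /proc/net/protocols: a
 * seq_file walk of proto_list (under proto_list_mutex) printing one line
 * per registered protocol.
 */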
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

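/*
 * One row per protocol.  The trailing single-character columns report
 * whether each struct proto callback is implemented ('y') or not ('n'),
 * in the order given by the header printed from proto_seq_show().
 */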
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
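/*
 * loop_end callback for napi_busy_loop() (see sk_busy_loop()): stop busy
 * polling once data has arrived on the receive queue or the socket's busy
 * poll time budget has expired.
 */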
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
	       sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

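/*
 * Bind an additional address to @sk via the protocol's ->bind_add() hook;
 * protocols that do not provide the hook return -EOPNOTSUPP.
 */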
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);