1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117
118 #include <linux/uaccess.h>
119
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136
137 #include <trace/events/sock.h>
138
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144
145 static void sock_inuse_add(struct net *net, int val);
146
147 /**
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
152 *
153 * Test to see if the opener of the socket had the capability @cap when
154 * the socket was created and if the current process has the capability
155 * @cap in the user namespace @user_ns.
156 */
157 bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
159 {
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164
165 /**
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
169 *
170 * Test to see if the opener of the socket had the capability @cap when
171 * the socket was created and if the current process has the capability
172 * @cap in all user namespaces.
173 */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179
180 /**
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
184 *
185 * Test to see if the opener of the socket had the capability @cap when the
186 * socket was created and if the current process has the capability @cap over
187 * the network namespace the socket is a member of.
188 */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194
195 /*
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family and separate keys for internal and
198 * userspace sockets.
199 */
200 static struct lock_class_key af_family_keys[AF_MAX];
201 static struct lock_class_key af_family_kern_keys[AF_MAX];
202 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204
205 /*
206 * Make lock validator output more readable. (we pre-construct these
207 * strings build-time, so that runtime initialization of socket
208 * locks is fast):
209 */
210
211 #define _sock_locks(x) \
212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
221 x "27" , x "28" , x "AF_CAN" , \
222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
227 x "AF_MAX"
228
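/*
 * For illustration: each array below is built by expanding _sock_locks()
 * with a different prefix, e.g.
 *
 *	_sock_locks("sk_lock-")
 *
 * expands to the initializer
 *
 *	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 *	"sk_lock-AF_MAX"
 *
 * giving lockdep a readable lock class name per address family.
 */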
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230 _sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 _sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 _sock_locks("clock-")
237 };
238
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 _sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 _sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 _sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 _sock_locks("rlock-")
250 };
251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 _sock_locks("wlock-")
253 };
254 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 _sock_locks("elock-")
256 };
257
258 /*
259 * sk_callback_lock and sk queues locking rules are per-address-family,
260 * so split the lock classes by using a per-AF key:
261 */
262 static struct lock_class_key af_callback_keys[AF_MAX];
263 static struct lock_class_key af_rlock_keys[AF_MAX];
264 static struct lock_class_key af_wlock_keys[AF_MAX];
265 static struct lock_class_key af_elock_keys[AF_MAX];
266 static struct lock_class_key af_kern_callback_keys[AF_MAX];
267
268 /* Run time adjustable parameters. */
269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270 EXPORT_SYMBOL(sysctl_wmem_max);
271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272 EXPORT_SYMBOL(sysctl_rmem_max);
273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275
276 /* Maximal space eaten by iovec or ancillary data plus some space */
277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278 EXPORT_SYMBOL(sysctl_optmem_max);
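/*
 * For example, on a typical 64-bit build (sizeof(unsigned long) == 8,
 * UIO_MAXIOV == 1024) the default works out to 8 * (2 * 1024 + 512) =
 * 20480 bytes of per-socket option memory.
 */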
279
280 int sysctl_tstamp_allow_data __read_mostly = 1;
281
282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283 EXPORT_SYMBOL_GPL(memalloc_socks_key);
284
285 /**
286 * sk_set_memalloc - sets %SOCK_MEMALLOC
287 * @sk: socket to set it on
288 *
289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290 * It's the responsibility of the admin to adjust min_free_kbytes
291 * to meet the requirements
292 */
293 void sk_set_memalloc(struct sock *sk)
294 {
295 sock_set_flag(sk, SOCK_MEMALLOC);
296 sk->sk_allocation |= __GFP_MEMALLOC;
297 static_branch_inc(&memalloc_socks_key);
298 }
299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
300
301 void sk_clear_memalloc(struct sock *sk)
302 {
303 sock_reset_flag(sk, SOCK_MEMALLOC);
304 sk->sk_allocation &= ~__GFP_MEMALLOC;
305 static_branch_dec(&memalloc_socks_key);
306
307 /*
308 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 * it has rmem allocations due to the last swapfile being deactivated
311 * but there is a risk that the socket is unusable due to exceeding
312 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 */
314 sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
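/*
 * Minimal usage sketch (illustrative, not a real in-tree caller): a driver
 * whose socket must keep making progress under memory pressure, such as one
 * backing swap over the network, brackets the socket's use with:
 *
 *	sk_set_memalloc(sock->sk);
 *	... use the socket for writeout; it may dip into emergency reserves ...
 *	sk_clear_memalloc(sock->sk);
 */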
317
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 int ret;
321 unsigned int noreclaim_flag;
322
323 /* these should have been dropped before queueing */
324 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325
326 noreclaim_flag = memalloc_noreclaim_save();
327 ret = sk->sk_backlog_rcv(sk, skb);
328 memalloc_noreclaim_restore(noreclaim_flag);
329
330 return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333
334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335 {
336 struct __kernel_sock_timeval tv;
337
338 if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 tv.tv_sec = 0;
340 tv.tv_usec = 0;
341 } else {
342 tv.tv_sec = timeo / HZ;
343 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 }
345
346 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
347 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 *(struct old_timeval32 *)optval = tv32;
349 return sizeof(tv32);
350 }
351
352 if (old_timeval) {
353 struct __kernel_old_timeval old_tv;
354 old_tv.tv_sec = tv.tv_sec;
355 old_tv.tv_usec = tv.tv_usec;
356 *(struct __kernel_old_timeval *)optval = old_tv;
357 return sizeof(old_tv);
358 }
359
360 *(struct __kernel_sock_timeval *)optval = tv;
361 return sizeof(tv);
362 }
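/*
 * Worked example (assuming HZ == 250): a timeout of 375 jiffies is reported
 * as tv_sec = 375 / 250 = 1 and tv_usec = (375 % 250) * 1000000 / 250 =
 * 500000, i.e. 1.5 seconds.
 */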
363
364 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
365 bool old_timeval)
366 {
367 struct __kernel_sock_timeval tv;
368
369 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
370 struct old_timeval32 tv32;
371
372 if (optlen < sizeof(tv32))
373 return -EINVAL;
374
375 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
376 return -EFAULT;
377 tv.tv_sec = tv32.tv_sec;
378 tv.tv_usec = tv32.tv_usec;
379 } else if (old_timeval) {
380 struct __kernel_old_timeval old_tv;
381
382 if (optlen < sizeof(old_tv))
383 return -EINVAL;
384 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
385 return -EFAULT;
386 tv.tv_sec = old_tv.tv_sec;
387 tv.tv_usec = old_tv.tv_usec;
388 } else {
389 if (optlen < sizeof(tv))
390 return -EINVAL;
391 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
392 return -EFAULT;
393 }
394 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
395 return -EDOM;
396
397 if (tv.tv_sec < 0) {
398 static int warned __read_mostly;
399
400 *timeo_p = 0;
401 if (warned < 10 && net_ratelimit()) {
402 warned++;
403 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
404 __func__, current->comm, task_pid_nr(current));
405 }
406 return 0;
407 }
408 *timeo_p = MAX_SCHEDULE_TIMEOUT;
409 if (tv.tv_sec == 0 && tv.tv_usec == 0)
410 return 0;
411 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
412 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
413 return 0;
414 }
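/*
 * Worked example (assuming HZ == 1000): a user request of
 * { .tv_sec = 1, .tv_usec = 500000 } becomes
 * 1 * 1000 + DIV_ROUND_UP(500000, 1000000 / 1000) = 1500 jiffies,
 * while { 0, 0 } leaves *timeo_p at MAX_SCHEDULE_TIMEOUT (wait forever).
 */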
415
416 static bool sock_needs_netstamp(const struct sock *sk)
417 {
418 switch (sk->sk_family) {
419 case AF_UNSPEC:
420 case AF_UNIX:
421 return false;
422 default:
423 return true;
424 }
425 }
426
427 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428 {
429 if (sk->sk_flags & flags) {
430 sk->sk_flags &= ~flags;
431 if (sock_needs_netstamp(sk) &&
432 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
433 net_disable_timestamp();
434 }
435 }
436
437
438 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 {
440 unsigned long flags;
441 struct sk_buff_head *list = &sk->sk_receive_queue;
442
443 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444 atomic_inc(&sk->sk_drops);
445 trace_sock_rcvqueue_full(sk, skb);
446 return -ENOMEM;
447 }
448
449 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
450 atomic_inc(&sk->sk_drops);
451 return -ENOBUFS;
452 }
453
454 skb->dev = NULL;
455 skb_set_owner_r(skb, sk);
456
457 /* we escape from the rcu protected region, make sure we don't leak
458 * a non-refcounted dst
459 */
460 skb_dst_force(skb);
461
462 spin_lock_irqsave(&list->lock, flags);
463 sock_skb_set_dropcount(sk, skb);
464 __skb_queue_tail(list, skb);
465 spin_unlock_irqrestore(&list->lock, flags);
466
467 if (!sock_flag(sk, SOCK_DEAD))
468 sk->sk_data_ready(sk);
469 return 0;
470 }
471 EXPORT_SYMBOL(__sock_queue_rcv_skb);
472
473 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
474 {
475 int err;
476
477 err = sk_filter(sk, skb);
478 if (err)
479 return err;
480
481 return __sock_queue_rcv_skb(sk, skb);
482 }
483 EXPORT_SYMBOL(sock_queue_rcv_skb);
484
485 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
486 const int nested, unsigned int trim_cap, bool refcounted)
487 {
488 int rc = NET_RX_SUCCESS;
489
490 if (sk_filter_trim_cap(sk, skb, trim_cap))
491 goto discard_and_relse;
492
493 skb->dev = NULL;
494
495 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
496 atomic_inc(&sk->sk_drops);
497 goto discard_and_relse;
498 }
499 if (nested)
500 bh_lock_sock_nested(sk);
501 else
502 bh_lock_sock(sk);
503 if (!sock_owned_by_user(sk)) {
504 /*
505 * trylock + unlock semantics:
506 */
507 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
508
509 rc = sk_backlog_rcv(sk, skb);
510
511 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
512 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
513 bh_unlock_sock(sk);
514 atomic_inc(&sk->sk_drops);
515 goto discard_and_relse;
516 }
517
518 bh_unlock_sock(sk);
519 out:
520 if (refcounted)
521 sock_put(sk);
522 return rc;
523 discard_and_relse:
524 kfree_skb(skb);
525 goto out;
526 }
527 EXPORT_SYMBOL(__sk_receive_skb);
528
529 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
530 {
531 struct dst_entry *dst = __sk_dst_get(sk);
532
533 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
534 sk_tx_queue_clear(sk);
535 sk->sk_dst_pending_confirm = 0;
536 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
537 dst_release(dst);
538 return NULL;
539 }
540
541 return dst;
542 }
543 EXPORT_SYMBOL(__sk_dst_check);
544
545 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
546 {
547 struct dst_entry *dst = sk_dst_get(sk);
548
549 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
550 sk_dst_reset(sk);
551 dst_release(dst);
552 return NULL;
553 }
554
555 return dst;
556 }
557 EXPORT_SYMBOL(sk_dst_check);
558
559 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
560 {
561 int ret = -ENOPROTOOPT;
562 #ifdef CONFIG_NETDEVICES
563 struct net *net = sock_net(sk);
564
565 /* Sorry... */
566 ret = -EPERM;
567 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
568 goto out;
569
570 ret = -EINVAL;
571 if (ifindex < 0)
572 goto out;
573
574 sk->sk_bound_dev_if = ifindex;
575 if (sk->sk_prot->rehash)
576 sk->sk_prot->rehash(sk);
577 sk_dst_reset(sk);
578
579 ret = 0;
580
581 out:
582 #endif
583
584 return ret;
585 }
586
587 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
588 {
589 int ret;
590
591 if (lock_sk)
592 lock_sock(sk);
593 ret = sock_bindtoindex_locked(sk, ifindex);
594 if (lock_sk)
595 release_sock(sk);
596
597 return ret;
598 }
599 EXPORT_SYMBOL(sock_bindtoindex);
600
601 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
602 {
603 int ret = -ENOPROTOOPT;
604 #ifdef CONFIG_NETDEVICES
605 struct net *net = sock_net(sk);
606 char devname[IFNAMSIZ];
607 int index;
608
609 ret = -EINVAL;
610 if (optlen < 0)
611 goto out;
612
613 /* Bind this socket to a particular device like "eth0",
614 * as specified in the passed interface name. If the
615 * name is "" or the option length is zero the socket
616 * is not bound.
617 */
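/* Illustrative userspace counterpart (a sketch, not kernel code):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * arrives here with optval referencing "eth0" and optlen == 5.
 */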
618 if (optlen > IFNAMSIZ - 1)
619 optlen = IFNAMSIZ - 1;
620 memset(devname, 0, sizeof(devname));
621
622 ret = -EFAULT;
623 if (copy_from_sockptr(devname, optval, optlen))
624 goto out;
625
626 index = 0;
627 if (devname[0] != '\0') {
628 struct net_device *dev;
629
630 rcu_read_lock();
631 dev = dev_get_by_name_rcu(net, devname);
632 if (dev)
633 index = dev->ifindex;
634 rcu_read_unlock();
635 ret = -ENODEV;
636 if (!dev)
637 goto out;
638 }
639
640 return sock_bindtoindex(sk, index, true);
641 out:
642 #endif
643
644 return ret;
645 }
646
647 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
648 int __user *optlen, int len)
649 {
650 int ret = -ENOPROTOOPT;
651 #ifdef CONFIG_NETDEVICES
652 struct net *net = sock_net(sk);
653 char devname[IFNAMSIZ];
654
655 if (sk->sk_bound_dev_if == 0) {
656 len = 0;
657 goto zero;
658 }
659
660 ret = -EINVAL;
661 if (len < IFNAMSIZ)
662 goto out;
663
664 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
665 if (ret)
666 goto out;
667
668 len = strlen(devname) + 1;
669
670 ret = -EFAULT;
671 if (copy_to_user(optval, devname, len))
672 goto out;
673
674 zero:
675 ret = -EFAULT;
676 if (put_user(len, optlen))
677 goto out;
678
679 ret = 0;
680
681 out:
682 #endif
683
684 return ret;
685 }
686
687 bool sk_mc_loop(struct sock *sk)
688 {
689 if (dev_recursion_level())
690 return false;
691 if (!sk)
692 return true;
693 switch (sk->sk_family) {
694 case AF_INET:
695 return inet_sk(sk)->mc_loop;
696 #if IS_ENABLED(CONFIG_IPV6)
697 case AF_INET6:
698 return inet6_sk(sk)->mc_loop;
699 #endif
700 }
701 WARN_ON_ONCE(1);
702 return true;
703 }
704 EXPORT_SYMBOL(sk_mc_loop);
705
706 void sock_set_reuseaddr(struct sock *sk)
707 {
708 lock_sock(sk);
709 sk->sk_reuse = SK_CAN_REUSE;
710 release_sock(sk);
711 }
712 EXPORT_SYMBOL(sock_set_reuseaddr);
713
714 void sock_set_reuseport(struct sock *sk)
715 {
716 lock_sock(sk);
717 sk->sk_reuseport = true;
718 release_sock(sk);
719 }
720 EXPORT_SYMBOL(sock_set_reuseport);
721
722 void sock_no_linger(struct sock *sk)
723 {
724 lock_sock(sk);
725 sk->sk_lingertime = 0;
726 sock_set_flag(sk, SOCK_LINGER);
727 release_sock(sk);
728 }
729 EXPORT_SYMBOL(sock_no_linger);
730
731 void sock_set_priority(struct sock *sk, u32 priority)
732 {
733 lock_sock(sk);
734 sk->sk_priority = priority;
735 release_sock(sk);
736 }
737 EXPORT_SYMBOL(sock_set_priority);
738
739 void sock_set_sndtimeo(struct sock *sk, s64 secs)
740 {
741 lock_sock(sk);
742 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
743 sk->sk_sndtimeo = secs * HZ;
744 else
745 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
746 release_sock(sk);
747 }
748 EXPORT_SYMBOL(sock_set_sndtimeo);
749
750 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
751 {
752 if (val) {
753 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
754 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
755 sock_set_flag(sk, SOCK_RCVTSTAMP);
756 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
757 } else {
758 sock_reset_flag(sk, SOCK_RCVTSTAMP);
759 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
760 }
761 }
762
763 void sock_enable_timestamps(struct sock *sk)
764 {
765 lock_sock(sk);
766 __sock_set_timestamps(sk, true, false, true);
767 release_sock(sk);
768 }
769 EXPORT_SYMBOL(sock_enable_timestamps);
770
771 void sock_set_keepalive(struct sock *sk)
772 {
773 lock_sock(sk);
774 if (sk->sk_prot->keepalive)
775 sk->sk_prot->keepalive(sk, true);
776 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
777 release_sock(sk);
778 }
779 EXPORT_SYMBOL(sock_set_keepalive);
780
781 static void __sock_set_rcvbuf(struct sock *sk, int val)
782 {
783 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
784 * as a negative value.
785 */
786 val = min_t(int, val, INT_MAX / 2);
787 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
788
789 /* We double it on the way in to account for "struct sk_buff" etc.
790 * overhead. Applications assume that the SO_RCVBUF setting they make
791 * will allow that much actual data to be received on that socket.
792 *
793 * Applications are unaware that "struct sk_buff" and other overheads
794 * allocate from the receive buffer during socket buffer allocation.
795 *
796 * And after considering the possible alternatives, returning the value
797 * we actually used in getsockopt is the most desirable behavior.
798 */
799 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
800 }
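/*
 * Example of the doubling above: a setsockopt(SO_RCVBUF) request of 65536
 * bytes (assuming it is below sysctl_rmem_max) stores sk_rcvbuf = 131072,
 * and a later getsockopt(SO_RCVBUF) reports 131072, the value actually
 * used for accounting.
 */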
801
802 void sock_set_rcvbuf(struct sock *sk, int val)
803 {
804 lock_sock(sk);
805 __sock_set_rcvbuf(sk, val);
806 release_sock(sk);
807 }
808 EXPORT_SYMBOL(sock_set_rcvbuf);
809
810 void sock_set_mark(struct sock *sk, u32 val)
811 {
812 lock_sock(sk);
813 sk->sk_mark = val;
814 release_sock(sk);
815 }
816 EXPORT_SYMBOL(sock_set_mark);
817
818 /*
819 * This is meant for all protocols to use and covers goings on
820 * at the socket level. Everything here is generic.
821 */
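/*
 * Illustrative userspace view (a sketch, not kernel code): a call such as
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * reaches sock_setsockopt() below with optname == SO_REUSEADDR and optval
 * wrapping the user pointer to 'one'.
 */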
822
823 int sock_setsockopt(struct socket *sock, int level, int optname,
824 sockptr_t optval, unsigned int optlen)
825 {
826 struct sock_txtime sk_txtime;
827 struct sock *sk = sock->sk;
828 int val;
829 int valbool;
830 struct linger ling;
831 int ret = 0;
832
833 /*
834 * Options without arguments
835 */
836
837 if (optname == SO_BINDTODEVICE)
838 return sock_setbindtodevice(sk, optval, optlen);
839
840 if (optlen < sizeof(int))
841 return -EINVAL;
842
843 if (copy_from_sockptr(&val, optval, sizeof(val)))
844 return -EFAULT;
845
846 valbool = val ? 1 : 0;
847
848 lock_sock(sk);
849
850 switch (optname) {
851 case SO_DEBUG:
852 if (val && !capable(CAP_NET_ADMIN))
853 ret = -EACCES;
854 else
855 sock_valbool_flag(sk, SOCK_DBG, valbool);
856 break;
857 case SO_REUSEADDR:
858 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
859 break;
860 case SO_REUSEPORT:
861 sk->sk_reuseport = valbool;
862 break;
863 case SO_TYPE:
864 case SO_PROTOCOL:
865 case SO_DOMAIN:
866 case SO_ERROR:
867 ret = -ENOPROTOOPT;
868 break;
869 case SO_DONTROUTE:
870 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
871 sk_dst_reset(sk);
872 break;
873 case SO_BROADCAST:
874 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
875 break;
876 case SO_SNDBUF:
877 /* Don't error on this; BSD doesn't and, if you think
878 * about it, this is right. Otherwise apps have to
879 * play 'guess the biggest size' games. RCVBUF/SNDBUF
880 * are treated in BSD as hints.
881 */
882 val = min_t(u32, val, sysctl_wmem_max);
883 set_sndbuf:
884 /* Ensure val * 2 fits into an int, to prevent max_t()
885 * from treating it as a negative value.
886 */
887 val = min_t(int, val, INT_MAX / 2);
888 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
889 WRITE_ONCE(sk->sk_sndbuf,
890 max_t(int, val * 2, SOCK_MIN_SNDBUF));
891 /* Wake up sending tasks if we upped the value. */
892 sk->sk_write_space(sk);
893 break;
894
895 case SO_SNDBUFFORCE:
896 if (!capable(CAP_NET_ADMIN)) {
897 ret = -EPERM;
898 break;
899 }
900
901 /* No negative values (to prevent underflow, as val will be
902 * multiplied by 2).
903 */
904 if (val < 0)
905 val = 0;
906 goto set_sndbuf;
907
908 case SO_RCVBUF:
909 /* Don't error on this; BSD doesn't and, if you think
910 * about it, this is right. Otherwise apps have to
911 * play 'guess the biggest size' games. RCVBUF/SNDBUF
912 * are treated in BSD as hints.
913 */
914 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
915 break;
916
917 case SO_RCVBUFFORCE:
918 if (!capable(CAP_NET_ADMIN)) {
919 ret = -EPERM;
920 break;
921 }
922
923 /* No negative values (to prevent underflow, as val will be
924 * multiplied by 2).
925 */
926 __sock_set_rcvbuf(sk, max(val, 0));
927 break;
928
929 case SO_KEEPALIVE:
930 if (sk->sk_prot->keepalive)
931 sk->sk_prot->keepalive(sk, valbool);
932 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
933 break;
934
935 case SO_OOBINLINE:
936 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
937 break;
938
939 case SO_NO_CHECK:
940 sk->sk_no_check_tx = valbool;
941 break;
942
943 case SO_PRIORITY:
944 if ((val >= 0 && val <= 6) ||
945 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
946 sk->sk_priority = val;
947 else
948 ret = -EPERM;
949 break;
950
951 case SO_LINGER:
952 if (optlen < sizeof(ling)) {
953 ret = -EINVAL; /* 1003.1g */
954 break;
955 }
956 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
957 ret = -EFAULT;
958 break;
959 }
960 if (!ling.l_onoff)
961 sock_reset_flag(sk, SOCK_LINGER);
962 else {
963 #if (BITS_PER_LONG == 32)
964 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
965 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
966 else
967 #endif
968 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
969 sock_set_flag(sk, SOCK_LINGER);
970 }
971 break;
972
973 case SO_BSDCOMPAT:
974 break;
975
976 case SO_PASSCRED:
977 if (valbool)
978 set_bit(SOCK_PASSCRED, &sock->flags);
979 else
980 clear_bit(SOCK_PASSCRED, &sock->flags);
981 break;
982
983 case SO_TIMESTAMP_OLD:
984 __sock_set_timestamps(sk, valbool, false, false);
985 break;
986 case SO_TIMESTAMP_NEW:
987 __sock_set_timestamps(sk, valbool, true, false);
988 break;
989 case SO_TIMESTAMPNS_OLD:
990 __sock_set_timestamps(sk, valbool, false, true);
991 break;
992 case SO_TIMESTAMPNS_NEW:
993 __sock_set_timestamps(sk, valbool, true, true);
994 break;
995 case SO_TIMESTAMPING_NEW:
996 case SO_TIMESTAMPING_OLD:
997 if (val & ~SOF_TIMESTAMPING_MASK) {
998 ret = -EINVAL;
999 break;
1000 }
1001
1002 if (val & SOF_TIMESTAMPING_OPT_ID &&
1003 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1004 if (sk->sk_protocol == IPPROTO_TCP &&
1005 sk->sk_type == SOCK_STREAM) {
1006 if ((1 << sk->sk_state) &
1007 (TCPF_CLOSE | TCPF_LISTEN)) {
1008 ret = -EINVAL;
1009 break;
1010 }
1011 sk->sk_tskey = tcp_sk(sk)->snd_una;
1012 } else {
1013 sk->sk_tskey = 0;
1014 }
1015 }
1016
1017 if (val & SOF_TIMESTAMPING_OPT_STATS &&
1018 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1019 ret = -EINVAL;
1020 break;
1021 }
1022
1023 sk->sk_tsflags = val;
1024 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1025
1026 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1027 sock_enable_timestamp(sk,
1028 SOCK_TIMESTAMPING_RX_SOFTWARE);
1029 else
1030 sock_disable_timestamp(sk,
1031 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1032 break;
1033
1034 case SO_RCVLOWAT:
1035 if (val < 0)
1036 val = INT_MAX;
1037 if (sock->ops->set_rcvlowat)
1038 ret = sock->ops->set_rcvlowat(sk, val);
1039 else
1040 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1041 break;
1042
1043 case SO_RCVTIMEO_OLD:
1044 case SO_RCVTIMEO_NEW:
1045 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1046 optlen, optname == SO_RCVTIMEO_OLD);
1047 break;
1048
1049 case SO_SNDTIMEO_OLD:
1050 case SO_SNDTIMEO_NEW:
1051 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1052 optlen, optname == SO_SNDTIMEO_OLD);
1053 break;
1054
1055 case SO_ATTACH_FILTER: {
1056 struct sock_fprog fprog;
1057
1058 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1059 if (!ret)
1060 ret = sk_attach_filter(&fprog, sk);
1061 break;
1062 }
1063 case SO_ATTACH_BPF:
1064 ret = -EINVAL;
1065 if (optlen == sizeof(u32)) {
1066 u32 ufd;
1067
1068 ret = -EFAULT;
1069 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1070 break;
1071
1072 ret = sk_attach_bpf(ufd, sk);
1073 }
1074 break;
1075
1076 case SO_ATTACH_REUSEPORT_CBPF: {
1077 struct sock_fprog fprog;
1078
1079 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1080 if (!ret)
1081 ret = sk_reuseport_attach_filter(&fprog, sk);
1082 break;
1083 }
1084 case SO_ATTACH_REUSEPORT_EBPF:
1085 ret = -EINVAL;
1086 if (optlen == sizeof(u32)) {
1087 u32 ufd;
1088
1089 ret = -EFAULT;
1090 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1091 break;
1092
1093 ret = sk_reuseport_attach_bpf(ufd, sk);
1094 }
1095 break;
1096
1097 case SO_DETACH_REUSEPORT_BPF:
1098 ret = reuseport_detach_prog(sk);
1099 break;
1100
1101 case SO_DETACH_FILTER:
1102 ret = sk_detach_filter(sk);
1103 break;
1104
1105 case SO_LOCK_FILTER:
1106 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1107 ret = -EPERM;
1108 else
1109 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1110 break;
1111
1112 case SO_PASSSEC:
1113 if (valbool)
1114 set_bit(SOCK_PASSSEC, &sock->flags);
1115 else
1116 clear_bit(SOCK_PASSSEC, &sock->flags);
1117 break;
1118 case SO_MARK:
1119 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1120 ret = -EPERM;
1121 } else if (val != sk->sk_mark) {
1122 sk->sk_mark = val;
1123 sk_dst_reset(sk);
1124 }
1125 break;
1126
1127 case SO_RXQ_OVFL:
1128 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1129 break;
1130
1131 case SO_WIFI_STATUS:
1132 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1133 break;
1134
1135 case SO_PEEK_OFF:
1136 if (sock->ops->set_peek_off)
1137 ret = sock->ops->set_peek_off(sk, val);
1138 else
1139 ret = -EOPNOTSUPP;
1140 break;
1141
1142 case SO_NOFCS:
1143 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1144 break;
1145
1146 case SO_SELECT_ERR_QUEUE:
1147 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1148 break;
1149
1150 #ifdef CONFIG_NET_RX_BUSY_POLL
1151 case SO_BUSY_POLL:
1152 /* allow unprivileged users to decrease the value */
1153 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1154 ret = -EPERM;
1155 else {
1156 if (val < 0)
1157 ret = -EINVAL;
1158 else
1159 sk->sk_ll_usec = val;
1160 }
1161 break;
1162 #endif
1163
1164 case SO_MAX_PACING_RATE:
1165 {
1166 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1167
1168 if (sizeof(ulval) != sizeof(val) &&
1169 optlen >= sizeof(ulval) &&
1170 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1171 ret = -EFAULT;
1172 break;
1173 }
1174 if (ulval != ~0UL)
1175 cmpxchg(&sk->sk_pacing_status,
1176 SK_PACING_NONE,
1177 SK_PACING_NEEDED);
1178 sk->sk_max_pacing_rate = ulval;
1179 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1180 break;
1181 }
1182 case SO_INCOMING_CPU:
1183 WRITE_ONCE(sk->sk_incoming_cpu, val);
1184 break;
1185
1186 case SO_CNX_ADVICE:
1187 if (val == 1)
1188 dst_negative_advice(sk);
1189 break;
1190
1191 case SO_ZEROCOPY:
1192 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1193 if (!((sk->sk_type == SOCK_STREAM &&
1194 sk->sk_protocol == IPPROTO_TCP) ||
1195 (sk->sk_type == SOCK_DGRAM &&
1196 sk->sk_protocol == IPPROTO_UDP)))
1197 ret = -ENOTSUPP;
1198 } else if (sk->sk_family != PF_RDS) {
1199 ret = -ENOTSUPP;
1200 }
1201 if (!ret) {
1202 if (val < 0 || val > 1)
1203 ret = -EINVAL;
1204 else
1205 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1206 }
1207 break;
1208
1209 case SO_TXTIME:
1210 if (optlen != sizeof(struct sock_txtime)) {
1211 ret = -EINVAL;
1212 break;
1213 } else if (copy_from_sockptr(&sk_txtime, optval,
1214 sizeof(struct sock_txtime))) {
1215 ret = -EFAULT;
1216 break;
1217 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1218 ret = -EINVAL;
1219 break;
1220 }
1221 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1222 * scheduler has enough safeguards.
1223 */
1224 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1225 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1226 ret = -EPERM;
1227 break;
1228 }
1229 sock_valbool_flag(sk, SOCK_TXTIME, true);
1230 sk->sk_clockid = sk_txtime.clockid;
1231 sk->sk_txtime_deadline_mode =
1232 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1233 sk->sk_txtime_report_errors =
1234 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1235 break;
1236
1237 case SO_BINDTOIFINDEX:
1238 ret = sock_bindtoindex_locked(sk, val);
1239 break;
1240
1241 default:
1242 ret = -ENOPROTOOPT;
1243 break;
1244 }
1245 release_sock(sk);
1246 return ret;
1247 }
1248 EXPORT_SYMBOL(sock_setsockopt);
1249
1250
1251 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1252 struct ucred *ucred)
1253 {
1254 ucred->pid = pid_vnr(pid);
1255 ucred->uid = ucred->gid = -1;
1256 if (cred) {
1257 struct user_namespace *current_ns = current_user_ns();
1258
1259 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1260 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1261 }
1262 }
1263
1264 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1265 {
1266 struct user_namespace *user_ns = current_user_ns();
1267 int i;
1268
1269 for (i = 0; i < src->ngroups; i++)
1270 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1271 return -EFAULT;
1272
1273 return 0;
1274 }
1275
1276 int sock_getsockopt(struct socket *sock, int level, int optname,
1277 char __user *optval, int __user *optlen)
1278 {
1279 struct sock *sk = sock->sk;
1280
1281 union {
1282 int val;
1283 u64 val64;
1284 unsigned long ulval;
1285 struct linger ling;
1286 struct old_timeval32 tm32;
1287 struct __kernel_old_timeval tm;
1288 struct __kernel_sock_timeval stm;
1289 struct sock_txtime txtime;
1290 } v;
1291
1292 int lv = sizeof(int);
1293 int len;
1294
1295 if (get_user(len, optlen))
1296 return -EFAULT;
1297 if (len < 0)
1298 return -EINVAL;
1299
1300 memset(&v, 0, sizeof(v));
1301
1302 switch (optname) {
1303 case SO_DEBUG:
1304 v.val = sock_flag(sk, SOCK_DBG);
1305 break;
1306
1307 case SO_DONTROUTE:
1308 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1309 break;
1310
1311 case SO_BROADCAST:
1312 v.val = sock_flag(sk, SOCK_BROADCAST);
1313 break;
1314
1315 case SO_SNDBUF:
1316 v.val = sk->sk_sndbuf;
1317 break;
1318
1319 case SO_RCVBUF:
1320 v.val = sk->sk_rcvbuf;
1321 break;
1322
1323 case SO_REUSEADDR:
1324 v.val = sk->sk_reuse;
1325 break;
1326
1327 case SO_REUSEPORT:
1328 v.val = sk->sk_reuseport;
1329 break;
1330
1331 case SO_KEEPALIVE:
1332 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1333 break;
1334
1335 case SO_TYPE:
1336 v.val = sk->sk_type;
1337 break;
1338
1339 case SO_PROTOCOL:
1340 v.val = sk->sk_protocol;
1341 break;
1342
1343 case SO_DOMAIN:
1344 v.val = sk->sk_family;
1345 break;
1346
1347 case SO_ERROR:
1348 v.val = -sock_error(sk);
1349 if (v.val == 0)
1350 v.val = xchg(&sk->sk_err_soft, 0);
1351 break;
1352
1353 case SO_OOBINLINE:
1354 v.val = sock_flag(sk, SOCK_URGINLINE);
1355 break;
1356
1357 case SO_NO_CHECK:
1358 v.val = sk->sk_no_check_tx;
1359 break;
1360
1361 case SO_PRIORITY:
1362 v.val = sk->sk_priority;
1363 break;
1364
1365 case SO_LINGER:
1366 lv = sizeof(v.ling);
1367 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1368 v.ling.l_linger = sk->sk_lingertime / HZ;
1369 break;
1370
1371 case SO_BSDCOMPAT:
1372 break;
1373
1374 case SO_TIMESTAMP_OLD:
1375 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1376 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1377 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1378 break;
1379
1380 case SO_TIMESTAMPNS_OLD:
1381 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1382 break;
1383
1384 case SO_TIMESTAMP_NEW:
1385 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1386 break;
1387
1388 case SO_TIMESTAMPNS_NEW:
1389 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1390 break;
1391
1392 case SO_TIMESTAMPING_OLD:
1393 v.val = sk->sk_tsflags;
1394 break;
1395
1396 case SO_RCVTIMEO_OLD:
1397 case SO_RCVTIMEO_NEW:
1398 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1399 break;
1400
1401 case SO_SNDTIMEO_OLD:
1402 case SO_SNDTIMEO_NEW:
1403 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1404 break;
1405
1406 case SO_RCVLOWAT:
1407 v.val = sk->sk_rcvlowat;
1408 break;
1409
1410 case SO_SNDLOWAT:
1411 v.val = 1;
1412 break;
1413
1414 case SO_PASSCRED:
1415 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1416 break;
1417
1418 case SO_PEERCRED:
1419 {
1420 struct ucred peercred;
1421 if (len > sizeof(peercred))
1422 len = sizeof(peercred);
1423 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1424 if (copy_to_user(optval, &peercred, len))
1425 return -EFAULT;
1426 goto lenout;
1427 }
1428
1429 case SO_PEERGROUPS:
1430 {
1431 int ret, n;
1432
1433 if (!sk->sk_peer_cred)
1434 return -ENODATA;
1435
1436 n = sk->sk_peer_cred->group_info->ngroups;
1437 if (len < n * sizeof(gid_t)) {
1438 len = n * sizeof(gid_t);
1439 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1440 }
1441 len = n * sizeof(gid_t);
1442
1443 ret = groups_to_user((gid_t __user *)optval,
1444 sk->sk_peer_cred->group_info);
1445 if (ret)
1446 return ret;
1447 goto lenout;
1448 }
1449
1450 case SO_PEERNAME:
1451 {
1452 char address[128];
1453
1454 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1455 if (lv < 0)
1456 return -ENOTCONN;
1457 if (lv < len)
1458 return -EINVAL;
1459 if (copy_to_user(optval, address, len))
1460 return -EFAULT;
1461 goto lenout;
1462 }
1463
1464 /* Dubious BSD thing... Probably nobody even uses it, but
1465 * the UNIX standard wants it for whatever reason... -DaveM
1466 */
1467 case SO_ACCEPTCONN:
1468 v.val = sk->sk_state == TCP_LISTEN;
1469 break;
1470
1471 case SO_PASSSEC:
1472 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1473 break;
1474
1475 case SO_PEERSEC:
1476 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1477
1478 case SO_MARK:
1479 v.val = sk->sk_mark;
1480 break;
1481
1482 case SO_RXQ_OVFL:
1483 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1484 break;
1485
1486 case SO_WIFI_STATUS:
1487 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1488 break;
1489
1490 case SO_PEEK_OFF:
1491 if (!sock->ops->set_peek_off)
1492 return -EOPNOTSUPP;
1493
1494 v.val = sk->sk_peek_off;
1495 break;
1496 case SO_NOFCS:
1497 v.val = sock_flag(sk, SOCK_NOFCS);
1498 break;
1499
1500 case SO_BINDTODEVICE:
1501 return sock_getbindtodevice(sk, optval, optlen, len);
1502
1503 case SO_GET_FILTER:
1504 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1505 if (len < 0)
1506 return len;
1507
1508 goto lenout;
1509
1510 case SO_LOCK_FILTER:
1511 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1512 break;
1513
1514 case SO_BPF_EXTENSIONS:
1515 v.val = bpf_tell_extensions();
1516 break;
1517
1518 case SO_SELECT_ERR_QUEUE:
1519 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1520 break;
1521
1522 #ifdef CONFIG_NET_RX_BUSY_POLL
1523 case SO_BUSY_POLL:
1524 v.val = sk->sk_ll_usec;
1525 break;
1526 #endif
1527
1528 case SO_MAX_PACING_RATE:
1529 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1530 lv = sizeof(v.ulval);
1531 v.ulval = sk->sk_max_pacing_rate;
1532 } else {
1533 /* 32bit version */
1534 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1535 }
1536 break;
1537
1538 case SO_INCOMING_CPU:
1539 v.val = READ_ONCE(sk->sk_incoming_cpu);
1540 break;
1541
1542 case SO_MEMINFO:
1543 {
1544 u32 meminfo[SK_MEMINFO_VARS];
1545
1546 sk_get_meminfo(sk, meminfo);
1547
1548 len = min_t(unsigned int, len, sizeof(meminfo));
1549 if (copy_to_user(optval, &meminfo, len))
1550 return -EFAULT;
1551
1552 goto lenout;
1553 }
1554
1555 #ifdef CONFIG_NET_RX_BUSY_POLL
1556 case SO_INCOMING_NAPI_ID:
1557 v.val = READ_ONCE(sk->sk_napi_id);
1558
1559 /* aggregate non-NAPI IDs down to 0 */
1560 if (v.val < MIN_NAPI_ID)
1561 v.val = 0;
1562
1563 break;
1564 #endif
1565
1566 case SO_COOKIE:
1567 lv = sizeof(u64);
1568 if (len < lv)
1569 return -EINVAL;
1570 v.val64 = sock_gen_cookie(sk);
1571 break;
1572
1573 case SO_ZEROCOPY:
1574 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1575 break;
1576
1577 case SO_TXTIME:
1578 lv = sizeof(v.txtime);
1579 v.txtime.clockid = sk->sk_clockid;
1580 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1581 SOF_TXTIME_DEADLINE_MODE : 0;
1582 v.txtime.flags |= sk->sk_txtime_report_errors ?
1583 SOF_TXTIME_REPORT_ERRORS : 0;
1584 break;
1585
1586 case SO_BINDTOIFINDEX:
1587 v.val = sk->sk_bound_dev_if;
1588 break;
1589
1590 default:
1591 /* We implement the SO_SNDLOWAT etc to not be settable
1592 * (1003.1g 7).
1593 */
1594 return -ENOPROTOOPT;
1595 }
1596
1597 if (len > lv)
1598 len = lv;
1599 if (copy_to_user(optval, &v, len))
1600 return -EFAULT;
1601 lenout:
1602 if (put_user(len, optlen))
1603 return -EFAULT;
1604 return 0;
1605 }
1606
1607 /*
1608 * Initialize an sk_lock.
1609 *
1610 * (We also register the sk_lock with the lock validator.)
1611 */
1612 static inline void sock_lock_init(struct sock *sk)
1613 {
1614 if (sk->sk_kern_sock)
1615 sock_lock_init_class_and_name(
1616 sk,
1617 af_family_kern_slock_key_strings[sk->sk_family],
1618 af_family_kern_slock_keys + sk->sk_family,
1619 af_family_kern_key_strings[sk->sk_family],
1620 af_family_kern_keys + sk->sk_family);
1621 else
1622 sock_lock_init_class_and_name(
1623 sk,
1624 af_family_slock_key_strings[sk->sk_family],
1625 af_family_slock_keys + sk->sk_family,
1626 af_family_key_strings[sk->sk_family],
1627 af_family_keys + sk->sk_family);
1628 }
1629
1630 /*
1631 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1632 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1633 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1634 */
1635 static void sock_copy(struct sock *nsk, const struct sock *osk)
1636 {
1637 const struct proto *prot = READ_ONCE(osk->sk_prot);
1638 #ifdef CONFIG_SECURITY_NETWORK
1639 void *sptr = nsk->sk_security;
1640 #endif
1641 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1642
1643 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1644 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1645
1646 #ifdef CONFIG_SECURITY_NETWORK
1647 nsk->sk_security = sptr;
1648 security_sk_clone(osk, nsk);
1649 #endif
1650 }
1651
1652 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1653 int family)
1654 {
1655 struct sock *sk;
1656 struct kmem_cache *slab;
1657
1658 slab = prot->slab;
1659 if (slab != NULL) {
1660 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1661 if (!sk)
1662 return sk;
1663 if (want_init_on_alloc(priority))
1664 sk_prot_clear_nulls(sk, prot->obj_size);
1665 } else
1666 sk = kmalloc(prot->obj_size, priority);
1667
1668 if (sk != NULL) {
1669 if (security_sk_alloc(sk, family, priority))
1670 goto out_free;
1671
1672 if (!try_module_get(prot->owner))
1673 goto out_free_sec;
1674 sk_tx_queue_clear(sk);
1675 }
1676
1677 return sk;
1678
1679 out_free_sec:
1680 security_sk_free(sk);
1681 out_free:
1682 if (slab != NULL)
1683 kmem_cache_free(slab, sk);
1684 else
1685 kfree(sk);
1686 return NULL;
1687 }
1688
1689 static void sk_prot_free(struct proto *prot, struct sock *sk)
1690 {
1691 struct kmem_cache *slab;
1692 struct module *owner;
1693
1694 owner = prot->owner;
1695 slab = prot->slab;
1696
1697 cgroup_sk_free(&sk->sk_cgrp_data);
1698 mem_cgroup_sk_free(sk);
1699 security_sk_free(sk);
1700 if (slab != NULL)
1701 kmem_cache_free(slab, sk);
1702 else
1703 kfree(sk);
1704 module_put(owner);
1705 }
1706
1707 /**
1708 * sk_alloc - All socket objects are allocated here
1709 * @net: the applicable net namespace
1710 * @family: protocol family
1711 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1712 * @prot: struct proto associated with this new sock instance
1713 * @kern: is this to be a kernel socket?
1714 */
1715 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1716 struct proto *prot, int kern)
1717 {
1718 struct sock *sk;
1719
1720 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1721 if (sk) {
1722 sk->sk_family = family;
1723 /*
1724 * See comment in struct sock definition to understand
1725 * why we need sk_prot_creator -acme
1726 */
1727 sk->sk_prot = sk->sk_prot_creator = prot;
1728 sk->sk_kern_sock = kern;
1729 sock_lock_init(sk);
1730 sk->sk_net_refcnt = kern ? 0 : 1;
1731 if (likely(sk->sk_net_refcnt)) {
1732 get_net(net);
1733 sock_inuse_add(net, 1);
1734 }
1735
1736 sock_net_set(sk, net);
1737 refcount_set(&sk->sk_wmem_alloc, 1);
1738
1739 mem_cgroup_sk_alloc(sk);
1740 cgroup_sk_alloc(&sk->sk_cgrp_data);
1741 sock_update_classid(&sk->sk_cgrp_data);
1742 sock_update_netprioidx(&sk->sk_cgrp_data);
1743 sk_tx_queue_clear(sk);
1744 }
1745
1746 return sk;
1747 }
1748 EXPORT_SYMBOL(sk_alloc);
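/*
 * Sketch of a typical caller (simplified, illustrative only): a protocol's
 * ->create() handler allocates and initialises its sock roughly as
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * with protocol-specific setup following.
 */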
1749
1750 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1751 * grace period. This is the case for UDP sockets and TCP listeners.
1752 */
1753 static void __sk_destruct(struct rcu_head *head)
1754 {
1755 struct sock *sk = container_of(head, struct sock, sk_rcu);
1756 struct sk_filter *filter;
1757
1758 if (sk->sk_destruct)
1759 sk->sk_destruct(sk);
1760
1761 filter = rcu_dereference_check(sk->sk_filter,
1762 refcount_read(&sk->sk_wmem_alloc) == 0);
1763 if (filter) {
1764 sk_filter_uncharge(sk, filter);
1765 RCU_INIT_POINTER(sk->sk_filter, NULL);
1766 }
1767
1768 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1769
1770 #ifdef CONFIG_BPF_SYSCALL
1771 bpf_sk_storage_free(sk);
1772 #endif
1773
1774 if (atomic_read(&sk->sk_omem_alloc))
1775 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1776 __func__, atomic_read(&sk->sk_omem_alloc));
1777
1778 if (sk->sk_frag.page) {
1779 put_page(sk->sk_frag.page);
1780 sk->sk_frag.page = NULL;
1781 }
1782
1783 if (sk->sk_peer_cred)
1784 put_cred(sk->sk_peer_cred);
1785 put_pid(sk->sk_peer_pid);
1786 if (likely(sk->sk_net_refcnt))
1787 put_net(sock_net(sk));
1788 sk_prot_free(sk->sk_prot_creator, sk);
1789 }
1790
1791 void sk_destruct(struct sock *sk)
1792 {
1793 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1794
1795 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1796 reuseport_detach_sock(sk);
1797 use_call_rcu = true;
1798 }
1799
1800 if (use_call_rcu)
1801 call_rcu(&sk->sk_rcu, __sk_destruct);
1802 else
1803 __sk_destruct(&sk->sk_rcu);
1804 }
1805
1806 static void __sk_free(struct sock *sk)
1807 {
1808 if (likely(sk->sk_net_refcnt))
1809 sock_inuse_add(sock_net(sk), -1);
1810
1811 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1812 sock_diag_broadcast_destroy(sk);
1813 else
1814 sk_destruct(sk);
1815 }
1816
1817 void sk_free(struct sock *sk)
1818 {
1819 /*
1820 * We subtract one from sk_wmem_alloc so we can tell whether
1821 * some packets are still in some tx queue.
1822 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1823 */
1824 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1825 __sk_free(sk);
1826 }
1827 EXPORT_SYMBOL(sk_free);
1828
1829 static void sk_init_common(struct sock *sk)
1830 {
1831 skb_queue_head_init(&sk->sk_receive_queue);
1832 skb_queue_head_init(&sk->sk_write_queue);
1833 skb_queue_head_init(&sk->sk_error_queue);
1834
1835 rwlock_init(&sk->sk_callback_lock);
1836 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1837 af_rlock_keys + sk->sk_family,
1838 af_family_rlock_key_strings[sk->sk_family]);
1839 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1840 af_wlock_keys + sk->sk_family,
1841 af_family_wlock_key_strings[sk->sk_family]);
1842 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1843 af_elock_keys + sk->sk_family,
1844 af_family_elock_key_strings[sk->sk_family]);
1845 lockdep_set_class_and_name(&sk->sk_callback_lock,
1846 af_callback_keys + sk->sk_family,
1847 af_family_clock_key_strings[sk->sk_family]);
1848 }
1849
1850 /**
1851 * sk_clone_lock - clone a socket, and lock its clone
1852 * @sk: the socket to clone
1853 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1854 *
1855 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1856 */
1857 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1858 {
1859 struct proto *prot = READ_ONCE(sk->sk_prot);
1860 struct sock *newsk;
1861 bool is_charged = true;
1862
1863 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1864 if (newsk != NULL) {
1865 struct sk_filter *filter;
1866
1867 sock_copy(newsk, sk);
1868
1869 newsk->sk_prot_creator = prot;
1870
1871 /* SANITY */
1872 if (likely(newsk->sk_net_refcnt))
1873 get_net(sock_net(newsk));
1874 sk_node_init(&newsk->sk_node);
1875 sock_lock_init(newsk);
1876 bh_lock_sock(newsk);
1877 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1878 newsk->sk_backlog.len = 0;
1879
1880 atomic_set(&newsk->sk_rmem_alloc, 0);
1881 /*
1882 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1883 */
1884 refcount_set(&newsk->sk_wmem_alloc, 1);
1885 atomic_set(&newsk->sk_omem_alloc, 0);
1886 sk_init_common(newsk);
1887
1888 newsk->sk_dst_cache = NULL;
1889 newsk->sk_dst_pending_confirm = 0;
1890 newsk->sk_wmem_queued = 0;
1891 newsk->sk_forward_alloc = 0;
1892 atomic_set(&newsk->sk_drops, 0);
1893 newsk->sk_send_head = NULL;
1894 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1895 atomic_set(&newsk->sk_zckey, 0);
1896
1897 sock_reset_flag(newsk, SOCK_DONE);
1898
1899 /* sk->sk_memcg will be populated at accept() time */
1900 newsk->sk_memcg = NULL;
1901
1902 cgroup_sk_clone(&newsk->sk_cgrp_data);
1903
1904 rcu_read_lock();
1905 filter = rcu_dereference(sk->sk_filter);
1906 if (filter != NULL)
1907 /* though it's an empty new sock, the charging may fail
1908 * if sysctl_optmem_max was changed between creation of
1909 * original socket and cloning
1910 */
1911 is_charged = sk_filter_charge(newsk, filter);
1912 RCU_INIT_POINTER(newsk->sk_filter, filter);
1913 rcu_read_unlock();
1914
1915 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1916 /* We need to make sure that we don't uncharge the new
1917 * socket if we couldn't charge it in the first place
1918 * as otherwise we uncharge the parent's filter.
1919 */
1920 if (!is_charged)
1921 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1922 sk_free_unlock_clone(newsk);
1923 newsk = NULL;
1924 goto out;
1925 }
1926 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1927
1928 if (bpf_sk_storage_clone(sk, newsk)) {
1929 sk_free_unlock_clone(newsk);
1930 newsk = NULL;
1931 goto out;
1932 }
1933
1934 /* Clear sk_user_data if parent had the pointer tagged
1935 * as not suitable for copying when cloning.
1936 */
1937 if (sk_user_data_is_nocopy(newsk))
1938 newsk->sk_user_data = NULL;
1939
1940 newsk->sk_err = 0;
1941 newsk->sk_err_soft = 0;
1942 newsk->sk_priority = 0;
1943 newsk->sk_incoming_cpu = raw_smp_processor_id();
1944 if (likely(newsk->sk_net_refcnt))
1945 sock_inuse_add(sock_net(newsk), 1);
1946
1947 /*
1948 * Before updating sk_refcnt, we must commit prior changes to memory
1949 * (Documentation/RCU/rculist_nulls.rst for details)
1950 */
1951 smp_wmb();
1952 refcount_set(&newsk->sk_refcnt, 2);
1953
1954 /*
1955 * Increment the counter in the same struct proto as the master
1956 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1957 * is the same as sk->sk_prot->socks, as this field was copied
1958 * with memcpy).
1959 *
1960 * This _changes_ the previous behaviour, where
1961 * tcp_create_openreq_child always was incrementing the
1962 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1963 * to be taken into account in all callers. -acme
1964 */
1965 sk_refcnt_debug_inc(newsk);
1966 sk_set_socket(newsk, NULL);
1967 sk_tx_queue_clear(newsk);
1968 RCU_INIT_POINTER(newsk->sk_wq, NULL);
1969
1970 if (newsk->sk_prot->sockets_allocated)
1971 sk_sockets_allocated_inc(newsk);
1972
1973 if (sock_needs_netstamp(sk) &&
1974 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1975 net_enable_timestamp();
1976 }
1977 out:
1978 return newsk;
1979 }
1980 EXPORT_SYMBOL_GPL(sk_clone_lock);
1981
1982 void sk_free_unlock_clone(struct sock *sk)
1983 {
1984 /* It is still a raw copy of the parent, so invalidate
1985 * the destructor and do a plain sk_free() */
1986 sk->sk_destruct = NULL;
1987 bh_unlock_sock(sk);
1988 sk_free(sk);
1989 }
1990 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1991
1992 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1993 {
1994 u32 max_segs = 1;
1995
1996 sk_dst_set(sk, dst);
1997 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1998 if (sk->sk_route_caps & NETIF_F_GSO)
1999 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2000 sk->sk_route_caps &= ~sk->sk_route_nocaps;
2001 if (sk_can_gso(sk)) {
2002 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2003 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2004 } else {
2005 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2006 sk->sk_gso_max_size = dst->dev->gso_max_size;
2007 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2008 }
2009 }
2010 sk->sk_gso_max_segs = max_segs;
2011 }
2012 EXPORT_SYMBOL_GPL(sk_setup_caps);
2013
2014 /*
2015 * Simple resource managers for sockets.
2016 */
2017
2018
2019 /*
2020 * Write buffer destructor automatically called from kfree_skb.
2021 */
2022 void sock_wfree(struct sk_buff *skb)
2023 {
2024 struct sock *sk = skb->sk;
2025 unsigned int len = skb->truesize;
2026
2027 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2028 /*
2029 		 * Keep a reference on sk_wmem_alloc; it will be released
2030 		 * after the sk_write_space() call
2031 */
2032 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2033 sk->sk_write_space(sk);
2034 len = 1;
2035 }
2036 /*
2037 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2038 * could not do because of in-flight packets
2039 */
2040 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2041 __sk_free(sk);
2042 }
2043 EXPORT_SYMBOL(sock_wfree);
2044
2045 /* This variant of sock_wfree() is used by TCP,
2046 * since it sets SOCK_USE_WRITE_QUEUE.
2047 */
2048 void __sock_wfree(struct sk_buff *skb)
2049 {
2050 struct sock *sk = skb->sk;
2051
2052 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2053 __sk_free(sk);
2054 }
2055
2056 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2057 {
2058 skb_orphan(skb);
2059 skb->sk = sk;
2060 #ifdef CONFIG_INET
2061 if (unlikely(!sk_fullsock(sk))) {
2062 skb->destructor = sock_edemux;
2063 sock_hold(sk);
2064 return;
2065 }
2066 #endif
2067 skb->destructor = sock_wfree;
2068 skb_set_hash_from_sk(skb, sk);
2069 /*
2070 * We used to take a refcount on sk, but following operation
2071 	 * is enough to guarantee sk_free() won't free this sock until
2072 * all in-flight packets are completed
2073 */
2074 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2075 }
2076 EXPORT_SYMBOL(skb_set_owner_w);
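
/* Illustrative sketch, not taken from this file ("len" is hypothetical): a
 * protocol that builds its own skb normally charges it to the sending socket,
 * so the sock_wfree() destructor releases the write-memory accounting when
 * the buffer is freed:
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);	(adds skb->truesize to sk_wmem_alloc)
 *	...
 *	kfree_skb(skb);			(sock_wfree() uncharges and may wake writers)
 */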
2077
2078 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2079 {
2080 #ifdef CONFIG_TLS_DEVICE
2081 /* Drivers depend on in-order delivery for crypto offload,
2082 * partial orphan breaks out-of-order-OK logic.
2083 */
2084 if (skb->decrypted)
2085 return false;
2086 #endif
2087 return (skb->destructor == sock_wfree ||
2088 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2089 }
2090
2091 /* This helper is used by netem, as it can hold packets in its
2092 * delay queue. We want to allow the owner socket to send more
2093 * packets, as if they were already TX completed by a typical driver.
2094 * But we also want to keep skb->sk set because some packet schedulers
2095 * rely on it (sch_fq for example).
2096 */
2097 void skb_orphan_partial(struct sk_buff *skb)
2098 {
2099 if (skb_is_tcp_pure_ack(skb))
2100 return;
2101
2102 if (can_skb_orphan_partial(skb)) {
2103 struct sock *sk = skb->sk;
2104
2105 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2106 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2107 skb->destructor = sock_efree;
2108 }
2109 } else {
2110 skb_orphan(skb);
2111 }
2112 }
2113 EXPORT_SYMBOL(skb_orphan_partial);
2114
2115 /*
2116 * Read buffer destructor automatically called from kfree_skb.
2117 */
2118 void sock_rfree(struct sk_buff *skb)
2119 {
2120 struct sock *sk = skb->sk;
2121 unsigned int len = skb->truesize;
2122
2123 atomic_sub(len, &sk->sk_rmem_alloc);
2124 sk_mem_uncharge(sk, len);
2125 }
2126 EXPORT_SYMBOL(sock_rfree);
2127
2128 /*
2129 * Buffer destructor for skbs that are not used directly in read or write
2130 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2131 */
2132 void sock_efree(struct sk_buff *skb)
2133 {
2134 sock_put(skb->sk);
2135 }
2136 EXPORT_SYMBOL(sock_efree);
2137
2138 /* Buffer destructor for prefetch/receive path where reference count may
2139 * not be held, e.g. for listen sockets.
2140 */
2141 #ifdef CONFIG_INET
2142 void sock_pfree(struct sk_buff *skb)
2143 {
2144 if (sk_is_refcounted(skb->sk))
2145 sock_gen_put(skb->sk);
2146 }
2147 EXPORT_SYMBOL(sock_pfree);
2148 #endif /* CONFIG_INET */
2149
2150 kuid_t sock_i_uid(struct sock *sk)
2151 {
2152 kuid_t uid;
2153
2154 read_lock_bh(&sk->sk_callback_lock);
2155 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2156 read_unlock_bh(&sk->sk_callback_lock);
2157 return uid;
2158 }
2159 EXPORT_SYMBOL(sock_i_uid);
2160
2161 unsigned long sock_i_ino(struct sock *sk)
2162 {
2163 unsigned long ino;
2164
2165 read_lock_bh(&sk->sk_callback_lock);
2166 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2167 read_unlock_bh(&sk->sk_callback_lock);
2168 return ino;
2169 }
2170 EXPORT_SYMBOL(sock_i_ino);
2171
2172 /*
2173 * Allocate a skb from the socket's send buffer.
2174 */
2175 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2176 gfp_t priority)
2177 {
2178 if (force ||
2179 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2180 struct sk_buff *skb = alloc_skb(size, priority);
2181
2182 if (skb) {
2183 skb_set_owner_w(skb, sk);
2184 return skb;
2185 }
2186 }
2187 return NULL;
2188 }
2189 EXPORT_SYMBOL(sock_wmalloc);
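
/* Hedged usage example (illustrative only; "hlen" and "len" are hypothetical):
 * callers typically allocate a frame against the send buffer and give up when
 * the socket is already over its limit:
 *
 *	struct sk_buff *skb = sock_wmalloc(sk, hlen + len, 0, GFP_KERNEL);
 *
 *	if (!skb)
 *		return -ENOBUFS;	(force == 0 and sk_sndbuf exhausted)
 */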
2190
2191 static void sock_ofree(struct sk_buff *skb)
2192 {
2193 struct sock *sk = skb->sk;
2194
2195 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2196 }
2197
2198 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2199 gfp_t priority)
2200 {
2201 struct sk_buff *skb;
2202
2203 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2204 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2205 sysctl_optmem_max)
2206 return NULL;
2207
2208 skb = alloc_skb(size, priority);
2209 if (!skb)
2210 return NULL;
2211
2212 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2213 skb->sk = sk;
2214 skb->destructor = sock_ofree;
2215 return skb;
2216 }
2217
2218 /*
2219 * Allocate a memory block from the socket's option memory buffer.
2220 */
2221 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2222 {
2223 if ((unsigned int)size <= sysctl_optmem_max &&
2224 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2225 void *mem;
2226 /* First do the add, to avoid the race if kmalloc
2227 * might sleep.
2228 */
2229 atomic_add(size, &sk->sk_omem_alloc);
2230 mem = kmalloc(size, priority);
2231 if (mem)
2232 return mem;
2233 atomic_sub(size, &sk->sk_omem_alloc);
2234 }
2235 return NULL;
2236 }
2237 EXPORT_SYMBOL(sock_kmalloc);
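
/* Illustrative sketch (struct foo_opt is hypothetical): per-option state is
 * usually charged against the socket's option memory with sock_kmalloc() and
 * released with sock_kfree_s()/sock_kzfree_s() of the same size:
 *
 *	struct foo_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */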
2238
2239 /* Free an option memory block. Note, we actually want the inline
2240 * here as this allows gcc to detect the nullify and fold away the
2241 * condition entirely.
2242 */
2243 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2244 const bool nullify)
2245 {
2246 if (WARN_ON_ONCE(!mem))
2247 return;
2248 if (nullify)
2249 kfree_sensitive(mem);
2250 else
2251 kfree(mem);
2252 atomic_sub(size, &sk->sk_omem_alloc);
2253 }
2254
2255 void sock_kfree_s(struct sock *sk, void *mem, int size)
2256 {
2257 __sock_kfree_s(sk, mem, size, false);
2258 }
2259 EXPORT_SYMBOL(sock_kfree_s);
2260
2261 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2262 {
2263 __sock_kfree_s(sk, mem, size, true);
2264 }
2265 EXPORT_SYMBOL(sock_kzfree_s);
2266
2267 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2268    I think these locks should be removed for datagram sockets.
2269 */
2270 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2271 {
2272 DEFINE_WAIT(wait);
2273
2274 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2275 for (;;) {
2276 if (!timeo)
2277 break;
2278 if (signal_pending(current))
2279 break;
2280 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2281 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2282 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2283 break;
2284 if (sk->sk_shutdown & SEND_SHUTDOWN)
2285 break;
2286 if (sk->sk_err)
2287 break;
2288 timeo = schedule_timeout(timeo);
2289 }
2290 finish_wait(sk_sleep(sk), &wait);
2291 return timeo;
2292 }
2293
2294
2295 /*
2296 * Generic send/receive buffer handlers
2297 */
2298
2299 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2300 unsigned long data_len, int noblock,
2301 int *errcode, int max_page_order)
2302 {
2303 struct sk_buff *skb;
2304 long timeo;
2305 int err;
2306
2307 timeo = sock_sndtimeo(sk, noblock);
2308 for (;;) {
2309 err = sock_error(sk);
2310 if (err != 0)
2311 goto failure;
2312
2313 err = -EPIPE;
2314 if (sk->sk_shutdown & SEND_SHUTDOWN)
2315 goto failure;
2316
2317 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2318 break;
2319
2320 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2321 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2322 err = -EAGAIN;
2323 if (!timeo)
2324 goto failure;
2325 if (signal_pending(current))
2326 goto interrupted;
2327 timeo = sock_wait_for_wmem(sk, timeo);
2328 }
2329 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2330 errcode, sk->sk_allocation);
2331 if (skb)
2332 skb_set_owner_w(skb, sk);
2333 return skb;
2334
2335 interrupted:
2336 err = sock_intr_errno(timeo);
2337 failure:
2338 *errcode = err;
2339 return NULL;
2340 }
2341 EXPORT_SYMBOL(sock_alloc_send_pskb);
2342
2343 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2344 int noblock, int *errcode)
2345 {
2346 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2347 }
2348 EXPORT_SYMBOL(sock_alloc_send_skb);
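
/* Hedged sketch of a typical datagram sendmsg() path (hlen/len and the error
 * handling are illustrative, not mandated by this file): the helper takes
 * care of sndbuf limits, send timeouts and pending signals:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;	(-EAGAIN, -EPIPE, sock_intr_errno(), ...)
 */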
2349
2350 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2351 struct sockcm_cookie *sockc)
2352 {
2353 u32 tsflags;
2354
2355 switch (cmsg->cmsg_type) {
2356 case SO_MARK:
2357 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2358 return -EPERM;
2359 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2360 return -EINVAL;
2361 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2362 break;
2363 case SO_TIMESTAMPING_OLD:
2364 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2365 return -EINVAL;
2366
2367 tsflags = *(u32 *)CMSG_DATA(cmsg);
2368 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2369 return -EINVAL;
2370
2371 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2372 sockc->tsflags |= tsflags;
2373 break;
2374 case SCM_TXTIME:
2375 if (!sock_flag(sk, SOCK_TXTIME))
2376 return -EINVAL;
2377 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2378 return -EINVAL;
2379 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2380 break;
2381 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2382 case SCM_RIGHTS:
2383 case SCM_CREDENTIALS:
2384 break;
2385 default:
2386 return -EINVAL;
2387 }
2388 return 0;
2389 }
2390 EXPORT_SYMBOL(__sock_cmsg_send);
2391
2392 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2393 struct sockcm_cookie *sockc)
2394 {
2395 struct cmsghdr *cmsg;
2396 int ret;
2397
2398 for_each_cmsghdr(cmsg, msg) {
2399 if (!CMSG_OK(msg, cmsg))
2400 return -EINVAL;
2401 if (cmsg->cmsg_level != SOL_SOCKET)
2402 continue;
2403 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2404 if (ret)
2405 return ret;
2406 }
2407 return 0;
2408 }
2409 EXPORT_SYMBOL(sock_cmsg_send);
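
/* Illustrative only: a transport usually seeds a sockcm_cookie from the
 * socket defaults and then lets sock_cmsg_send() apply any SOL_SOCKET
 * control messages supplied by the caller:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */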
2410
2411 static void sk_enter_memory_pressure(struct sock *sk)
2412 {
2413 if (!sk->sk_prot->enter_memory_pressure)
2414 return;
2415
2416 sk->sk_prot->enter_memory_pressure(sk);
2417 }
2418
2419 static void sk_leave_memory_pressure(struct sock *sk)
2420 {
2421 if (sk->sk_prot->leave_memory_pressure) {
2422 sk->sk_prot->leave_memory_pressure(sk);
2423 } else {
2424 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2425
2426 if (memory_pressure && READ_ONCE(*memory_pressure))
2427 WRITE_ONCE(*memory_pressure, 0);
2428 }
2429 }
2430
2431 #define SKB_FRAG_PAGE_ORDER get_order(32768)
2432 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2433
2434 /**
2435 * skb_page_frag_refill - check that a page_frag contains enough room
2436 * @sz: minimum size of the fragment we want to get
2437 * @pfrag: pointer to page_frag
2438 * @gfp: priority for memory allocation
2439 *
2440 * Note: While this allocator tries to use high order pages, there is
2441 * no guarantee that allocations succeed. Therefore, @sz MUST be
2442  * less than or equal to PAGE_SIZE.
2443 */
2444 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2445 {
2446 if (pfrag->page) {
2447 if (page_ref_count(pfrag->page) == 1) {
2448 pfrag->offset = 0;
2449 return true;
2450 }
2451 if (pfrag->offset + sz <= pfrag->size)
2452 return true;
2453 put_page(pfrag->page);
2454 }
2455
2456 pfrag->offset = 0;
2457 if (SKB_FRAG_PAGE_ORDER &&
2458 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2459 /* Avoid direct reclaim but allow kswapd to wake */
2460 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2461 __GFP_COMP | __GFP_NOWARN |
2462 __GFP_NORETRY,
2463 SKB_FRAG_PAGE_ORDER);
2464 if (likely(pfrag->page)) {
2465 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2466 return true;
2467 }
2468 }
2469 pfrag->page = alloc_page(gfp);
2470 if (likely(pfrag->page)) {
2471 pfrag->size = PAGE_SIZE;
2472 return true;
2473 }
2474 return false;
2475 }
2476 EXPORT_SYMBOL(skb_page_frag_refill);
2477
2478 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2479 {
2480 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2481 return true;
2482
2483 sk_enter_memory_pressure(sk);
2484 sk_stream_moderate_sndbuf(sk);
2485 return false;
2486 }
2487 EXPORT_SYMBOL(sk_page_frag_refill);
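
/* Hedged sketch (the wait_for_memory label is hypothetical): a sendmsg fast
 * path typically refills the per-socket page fragment and copies user data
 * into it before attaching the page to an skb:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 */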
2488
2489 static void __lock_sock(struct sock *sk)
2490 __releases(&sk->sk_lock.slock)
2491 __acquires(&sk->sk_lock.slock)
2492 {
2493 DEFINE_WAIT(wait);
2494
2495 for (;;) {
2496 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2497 TASK_UNINTERRUPTIBLE);
2498 spin_unlock_bh(&sk->sk_lock.slock);
2499 schedule();
2500 spin_lock_bh(&sk->sk_lock.slock);
2501 if (!sock_owned_by_user(sk))
2502 break;
2503 }
2504 finish_wait(&sk->sk_lock.wq, &wait);
2505 }
2506
2507 void __release_sock(struct sock *sk)
2508 __releases(&sk->sk_lock.slock)
2509 __acquires(&sk->sk_lock.slock)
2510 {
2511 struct sk_buff *skb, *next;
2512
2513 while ((skb = sk->sk_backlog.head) != NULL) {
2514 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2515
2516 spin_unlock_bh(&sk->sk_lock.slock);
2517
2518 do {
2519 next = skb->next;
2520 prefetch(next);
2521 WARN_ON_ONCE(skb_dst_is_noref(skb));
2522 skb_mark_not_on_list(skb);
2523 sk_backlog_rcv(sk, skb);
2524
2525 cond_resched();
2526
2527 skb = next;
2528 } while (skb != NULL);
2529
2530 spin_lock_bh(&sk->sk_lock.slock);
2531 }
2532
2533 /*
2534 	 * Doing the zeroing here guarantees we cannot loop forever
2535 * while a wild producer attempts to flood us.
2536 */
2537 sk->sk_backlog.len = 0;
2538 }
2539
2540 void __sk_flush_backlog(struct sock *sk)
2541 {
2542 spin_lock_bh(&sk->sk_lock.slock);
2543 __release_sock(sk);
2544 spin_unlock_bh(&sk->sk_lock.slock);
2545 }
2546
2547 /**
2548 * sk_wait_data - wait for data to arrive at sk_receive_queue
2549 * @sk: sock to wait on
2550 * @timeo: for how long
2551 * @skb: last skb seen on sk_receive_queue
2552 *
2553 * Now socket state including sk->sk_err is changed only under lock,
2554 * hence we may omit checks after joining wait queue.
2555  * We check receive queue before schedule() only as an optimization;
2556 * it is very likely that release_sock() added new data.
2557 */
2558 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2559 {
2560 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2561 int rc;
2562
2563 add_wait_queue(sk_sleep(sk), &wait);
2564 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2565 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2566 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2567 remove_wait_queue(sk_sleep(sk), &wait);
2568 return rc;
2569 }
2570 EXPORT_SYMBOL(sk_wait_data);
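
/* Simplified, illustrative sketch of a blocking recvmsg() loop; real callers
 * also handle signals, sk_err and shutdown, and run under lock_sock():
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */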
2571
2572 /**
2573 * __sk_mem_raise_allocated - increase memory_allocated
2574 * @sk: socket
2575 * @size: memory size to allocate
2576 * @amt: pages to allocate
2577 * @kind: allocation type
2578 *
2579 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2580 */
2581 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2582 {
2583 struct proto *prot = sk->sk_prot;
2584 long allocated = sk_memory_allocated_add(sk, amt);
2585 bool charged = true;
2586
2587 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2588 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2589 goto suppress_allocation;
2590
2591 /* Under limit. */
2592 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2593 sk_leave_memory_pressure(sk);
2594 return 1;
2595 }
2596
2597 /* Under pressure. */
2598 if (allocated > sk_prot_mem_limits(sk, 1))
2599 sk_enter_memory_pressure(sk);
2600
2601 /* Over hard limit. */
2602 if (allocated > sk_prot_mem_limits(sk, 2))
2603 goto suppress_allocation;
2604
2605 /* guarantee minimum buffer size under pressure */
2606 if (kind == SK_MEM_RECV) {
2607 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2608 return 1;
2609
2610 } else { /* SK_MEM_SEND */
2611 int wmem0 = sk_get_wmem0(sk, prot);
2612
2613 if (sk->sk_type == SOCK_STREAM) {
2614 if (sk->sk_wmem_queued < wmem0)
2615 return 1;
2616 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2617 return 1;
2618 }
2619 }
2620
2621 if (sk_has_memory_pressure(sk)) {
2622 u64 alloc;
2623
2624 if (!sk_under_memory_pressure(sk))
2625 return 1;
2626 alloc = sk_sockets_allocated_read_positive(sk);
2627 if (sk_prot_mem_limits(sk, 2) > alloc *
2628 sk_mem_pages(sk->sk_wmem_queued +
2629 atomic_read(&sk->sk_rmem_alloc) +
2630 sk->sk_forward_alloc))
2631 return 1;
2632 }
2633
2634 suppress_allocation:
2635
2636 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2637 sk_stream_moderate_sndbuf(sk);
2638
2639 /* Fail only if socket is _under_ its sndbuf.
2640 		 * In this case we cannot block, so we have to fail.
2641 */
2642 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2643 return 1;
2644 }
2645
2646 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2647 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2648
2649 sk_memory_allocated_sub(sk, amt);
2650
2651 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2652 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2653
2654 return 0;
2655 }
2656 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2657
2658 /**
2659 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2660 * @sk: socket
2661 * @size: memory size to allocate
2662 * @kind: allocation type
2663 *
2664 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2665 * rmem allocation. This function assumes that protocols which have
2666 * memory_pressure use sk_wmem_queued as write buffer accounting.
2667 */
2668 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2669 {
2670 int ret, amt = sk_mem_pages(size);
2671
2672 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2673 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2674 if (!ret)
2675 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2676 return ret;
2677 }
2678 EXPORT_SYMBOL(__sk_mem_schedule);
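
/* Illustrative: protocols normally reach this through the sk_rmem_schedule()
 * and sk_wmem_schedule() helpers in include/net/sock.h, which only call
 * __sk_mem_schedule() when sk_forward_alloc cannot already cover the request:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;	(accounting refused, drop the skb)
 *	skb_set_owner_r(skb, sk);
 */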
2679
2680 /**
2681 * __sk_mem_reduce_allocated - reclaim memory_allocated
2682 * @sk: socket
2683 * @amount: number of quanta
2684 *
2685 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2686 */
2687 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2688 {
2689 sk_memory_allocated_sub(sk, amount);
2690
2691 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2692 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2693
2694 if (sk_under_memory_pressure(sk) &&
2695 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2696 sk_leave_memory_pressure(sk);
2697 }
2698 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2699
2700 /**
2701 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2702 * @sk: socket
2703 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2704 */
2705 void __sk_mem_reclaim(struct sock *sk, int amount)
2706 {
2707 amount >>= SK_MEM_QUANTUM_SHIFT;
2708 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2709 __sk_mem_reduce_allocated(sk, amount);
2710 }
2711 EXPORT_SYMBOL(__sk_mem_reclaim);
2712
2713 int sk_set_peek_off(struct sock *sk, int val)
2714 {
2715 sk->sk_peek_off = val;
2716 return 0;
2717 }
2718 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2719
2720 /*
2721 * Set of default routines for initialising struct proto_ops when
2722 * the protocol does not support a particular function. In certain
2723 * cases where it makes no sense for a protocol to have a "do nothing"
2724 * function, some default processing is provided.
2725 */
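
/* Illustrative sketch (PF_FOO and foo_ops are hypothetical): a protocol that
 * does not support connection oriented operations can simply plug these stubs
 * into its proto_ops:
 *
 *	static const struct proto_ops foo_ops = {
 *		.family	= PF_FOO,
 *		.listen	= sock_no_listen,
 *		.accept	= sock_no_accept,
 *		.mmap	= sock_no_mmap,
 *		...
 *	};
 */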
2726
2727 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2728 {
2729 return -EOPNOTSUPP;
2730 }
2731 EXPORT_SYMBOL(sock_no_bind);
2732
2733 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2734 int len, int flags)
2735 {
2736 return -EOPNOTSUPP;
2737 }
2738 EXPORT_SYMBOL(sock_no_connect);
2739
2740 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2741 {
2742 return -EOPNOTSUPP;
2743 }
2744 EXPORT_SYMBOL(sock_no_socketpair);
2745
2746 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2747 bool kern)
2748 {
2749 return -EOPNOTSUPP;
2750 }
2751 EXPORT_SYMBOL(sock_no_accept);
2752
2753 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2754 int peer)
2755 {
2756 return -EOPNOTSUPP;
2757 }
2758 EXPORT_SYMBOL(sock_no_getname);
2759
2760 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2761 {
2762 return -EOPNOTSUPP;
2763 }
2764 EXPORT_SYMBOL(sock_no_ioctl);
2765
2766 int sock_no_listen(struct socket *sock, int backlog)
2767 {
2768 return -EOPNOTSUPP;
2769 }
2770 EXPORT_SYMBOL(sock_no_listen);
2771
2772 int sock_no_shutdown(struct socket *sock, int how)
2773 {
2774 return -EOPNOTSUPP;
2775 }
2776 EXPORT_SYMBOL(sock_no_shutdown);
2777
2778 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2779 {
2780 return -EOPNOTSUPP;
2781 }
2782 EXPORT_SYMBOL(sock_no_sendmsg);
2783
2784 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2785 {
2786 return -EOPNOTSUPP;
2787 }
2788 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2789
2790 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2791 int flags)
2792 {
2793 return -EOPNOTSUPP;
2794 }
2795 EXPORT_SYMBOL(sock_no_recvmsg);
2796
2797 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2798 {
2799 /* Mirror missing mmap method error code */
2800 return -ENODEV;
2801 }
2802 EXPORT_SYMBOL(sock_no_mmap);
2803
2804 /*
2805 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2806 * various sock-based usage counts.
2807 */
2808 void __receive_sock(struct file *file)
2809 {
2810 struct socket *sock;
2811 int error;
2812
2813 /*
2814 * The resulting value of "error" is ignored here since we only
2815 * need to take action when the file is a socket and testing
2816 * "sock" for NULL is sufficient.
2817 */
2818 sock = sock_from_file(file, &error);
2819 if (sock) {
2820 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2821 sock_update_classid(&sock->sk->sk_cgrp_data);
2822 }
2823 }
2824
2825 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2826 {
2827 ssize_t res;
2828 struct msghdr msg = {.msg_flags = flags};
2829 struct kvec iov;
2830 char *kaddr = kmap(page);
2831 iov.iov_base = kaddr + offset;
2832 iov.iov_len = size;
2833 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2834 kunmap(page);
2835 return res;
2836 }
2837 EXPORT_SYMBOL(sock_no_sendpage);
2838
2839 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2840 int offset, size_t size, int flags)
2841 {
2842 ssize_t res;
2843 struct msghdr msg = {.msg_flags = flags};
2844 struct kvec iov;
2845 char *kaddr = kmap(page);
2846
2847 iov.iov_base = kaddr + offset;
2848 iov.iov_len = size;
2849 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2850 kunmap(page);
2851 return res;
2852 }
2853 EXPORT_SYMBOL(sock_no_sendpage_locked);
2854
2855 /*
2856 * Default Socket Callbacks
2857 */
2858
2859 static void sock_def_wakeup(struct sock *sk)
2860 {
2861 struct socket_wq *wq;
2862
2863 rcu_read_lock();
2864 wq = rcu_dereference(sk->sk_wq);
2865 if (skwq_has_sleeper(wq))
2866 wake_up_interruptible_all(&wq->wait);
2867 rcu_read_unlock();
2868 }
2869
2870 static void sock_def_error_report(struct sock *sk)
2871 {
2872 struct socket_wq *wq;
2873
2874 rcu_read_lock();
2875 wq = rcu_dereference(sk->sk_wq);
2876 if (skwq_has_sleeper(wq))
2877 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2878 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2879 rcu_read_unlock();
2880 }
2881
2882 void sock_def_readable(struct sock *sk)
2883 {
2884 struct socket_wq *wq;
2885
2886 rcu_read_lock();
2887 wq = rcu_dereference(sk->sk_wq);
2888 if (skwq_has_sleeper(wq))
2889 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2890 EPOLLRDNORM | EPOLLRDBAND);
2891 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2892 rcu_read_unlock();
2893 }
2894
2895 static void sock_def_write_space(struct sock *sk)
2896 {
2897 struct socket_wq *wq;
2898
2899 rcu_read_lock();
2900
2901 /* Do not wake up a writer until he can make "significant"
2902 * progress. --DaveM
2903 */
2904 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2905 wq = rcu_dereference(sk->sk_wq);
2906 if (skwq_has_sleeper(wq))
2907 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2908 EPOLLWRNORM | EPOLLWRBAND);
2909
2910 /* Should agree with poll, otherwise some programs break */
2911 if (sock_writeable(sk))
2912 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2913 }
2914
2915 rcu_read_unlock();
2916 }
2917
2918 static void sock_def_destruct(struct sock *sk)
2919 {
2920 }
2921
2922 void sk_send_sigurg(struct sock *sk)
2923 {
2924 if (sk->sk_socket && sk->sk_socket->file)
2925 if (send_sigurg(&sk->sk_socket->file->f_owner))
2926 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2927 }
2928 EXPORT_SYMBOL(sk_send_sigurg);
2929
2930 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
2931 unsigned long expires)
2932 {
2933 if (!mod_timer(timer, expires))
2934 sock_hold(sk);
2935 }
2936 EXPORT_SYMBOL(sk_reset_timer);
2937
2938 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
2939 {
2940 if (del_timer(timer))
2941 __sock_put(sk);
2942 }
2943 EXPORT_SYMBOL(sk_stop_timer);
2944
2945 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2946 {
2947 if (del_timer_sync(timer))
2948 __sock_put(sk);
2949 }
2950 EXPORT_SYMBOL(sk_stop_timer_sync);
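
/* Hedged example: a protocol timer is usually armed with sk_reset_timer() so
 * the socket reference is held while the timer is pending, and dropped again
 * by sk_stop_timer() when it is cancelled (sk->sk_timer used for illustration):
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */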
2951
2952 void sock_init_data(struct socket *sock, struct sock *sk)
2953 {
2954 sk_init_common(sk);
2955 sk->sk_send_head = NULL;
2956
2957 timer_setup(&sk->sk_timer, NULL, 0);
2958
2959 sk->sk_allocation = GFP_KERNEL;
2960 sk->sk_rcvbuf = sysctl_rmem_default;
2961 sk->sk_sndbuf = sysctl_wmem_default;
2962 sk->sk_state = TCP_CLOSE;
2963 sk_set_socket(sk, sock);
2964
2965 sock_set_flag(sk, SOCK_ZAPPED);
2966
2967 if (sock) {
2968 sk->sk_type = sock->type;
2969 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2970 sock->sk = sk;
2971 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2972 } else {
2973 RCU_INIT_POINTER(sk->sk_wq, NULL);
2974 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2975 }
2976
2977 rwlock_init(&sk->sk_callback_lock);
2978 if (sk->sk_kern_sock)
2979 lockdep_set_class_and_name(
2980 &sk->sk_callback_lock,
2981 af_kern_callback_keys + sk->sk_family,
2982 af_family_kern_clock_key_strings[sk->sk_family]);
2983 else
2984 lockdep_set_class_and_name(
2985 &sk->sk_callback_lock,
2986 af_callback_keys + sk->sk_family,
2987 af_family_clock_key_strings[sk->sk_family]);
2988
2989 sk->sk_state_change = sock_def_wakeup;
2990 sk->sk_data_ready = sock_def_readable;
2991 sk->sk_write_space = sock_def_write_space;
2992 sk->sk_error_report = sock_def_error_report;
2993 sk->sk_destruct = sock_def_destruct;
2994
2995 sk->sk_frag.page = NULL;
2996 sk->sk_frag.offset = 0;
2997 sk->sk_peek_off = -1;
2998
2999 sk->sk_peer_pid = NULL;
3000 sk->sk_peer_cred = NULL;
3001 sk->sk_write_pending = 0;
3002 sk->sk_rcvlowat = 1;
3003 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3004 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3005
3006 sk->sk_stamp = SK_DEFAULT_STAMP;
3007 #if BITS_PER_LONG==32
3008 seqlock_init(&sk->sk_stamp_seq);
3009 #endif
3010 atomic_set(&sk->sk_zckey, 0);
3011
3012 #ifdef CONFIG_NET_RX_BUSY_POLL
3013 sk->sk_napi_id = 0;
3014 sk->sk_ll_usec = sysctl_net_busy_read;
3015 #endif
3016
3017 sk->sk_max_pacing_rate = ~0UL;
3018 sk->sk_pacing_rate = ~0UL;
3019 WRITE_ONCE(sk->sk_pacing_shift, 10);
3020 sk->sk_incoming_cpu = -1;
3021
3022 sk_rx_queue_clear(sk);
3023 /*
3024 * Before updating sk_refcnt, we must commit prior changes to memory
3025 * (Documentation/RCU/rculist_nulls.rst for details)
3026 */
3027 smp_wmb();
3028 refcount_set(&sk->sk_refcnt, 1);
3029 atomic_set(&sk->sk_drops, 0);
3030 }
3031 EXPORT_SYMBOL(sock_init_data);
3032
3033 void lock_sock_nested(struct sock *sk, int subclass)
3034 {
3035 might_sleep();
3036 spin_lock_bh(&sk->sk_lock.slock);
3037 if (sk->sk_lock.owned)
3038 __lock_sock(sk);
3039 sk->sk_lock.owned = 1;
3040 spin_unlock(&sk->sk_lock.slock);
3041 /*
3042 * The sk_lock has mutex_lock() semantics here:
3043 */
3044 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3045 local_bh_enable();
3046 }
3047 EXPORT_SYMBOL(lock_sock_nested);
3048
3049 void release_sock(struct sock *sk)
3050 {
3051 spin_lock_bh(&sk->sk_lock.slock);
3052 if (sk->sk_backlog.tail)
3053 __release_sock(sk);
3054
3055 	/* Warning: release_cb() might need to release sk ownership,
3056 	 * i.e. call sock_release_ownership(sk) before us.
3057 */
3058 if (sk->sk_prot->release_cb)
3059 sk->sk_prot->release_cb(sk);
3060
3061 sock_release_ownership(sk);
3062 if (waitqueue_active(&sk->sk_lock.wq))
3063 wake_up(&sk->sk_lock.wq);
3064 spin_unlock_bh(&sk->sk_lock.slock);
3065 }
3066 EXPORT_SYMBOL(release_sock);
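
/* Illustrative usage: most syscall-side handlers bracket socket state changes
 * with lock_sock()/release_sock(), so packets arriving in softirq context are
 * queued to the backlog and processed in release_sock():
 *
 *	lock_sock(sk);
 *	... mutate protocol state attached to sk ...
 *	release_sock(sk);
 */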
3067
3068 /**
3069 * lock_sock_fast - fast version of lock_sock
3070 * @sk: socket
3071 *
3072  * This version should be used for very small sections, where the process won't block.
3073  * Return false if the fast path is taken:
3074 *
3075 * sk_lock.slock locked, owned = 0, BH disabled
3076 *
3077  * Return true if the slow path is taken:
3078 *
3079 * sk_lock.slock unlocked, owned = 1, BH enabled
3080 */
3081 bool lock_sock_fast(struct sock *sk)
3082 {
3083 might_sleep();
3084 spin_lock_bh(&sk->sk_lock.slock);
3085
3086 if (!sk->sk_lock.owned)
3087 /*
3088 * Note : We must disable BH
3089 */
3090 return false;
3091
3092 __lock_sock(sk);
3093 sk->sk_lock.owned = 1;
3094 spin_unlock(&sk->sk_lock.slock);
3095 /*
3096 * The sk_lock has mutex_lock() semantics here:
3097 */
3098 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3099 local_bh_enable();
3100 return true;
3101 }
3102 EXPORT_SYMBOL(lock_sock_fast);
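
/* Hedged example: callers pair this with unlock_sock_fast(), passing back the
 * return value so the right unlock path is taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */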
3103
3104 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3105 bool timeval, bool time32)
3106 {
3107 struct sock *sk = sock->sk;
3108 struct timespec64 ts;
3109
3110 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3111 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3112 if (ts.tv_sec == -1)
3113 return -ENOENT;
3114 if (ts.tv_sec == 0) {
3115 ktime_t kt = ktime_get_real();
3116 sock_write_timestamp(sk, kt);
3117 ts = ktime_to_timespec64(kt);
3118 }
3119
3120 if (timeval)
3121 ts.tv_nsec /= 1000;
3122
3123 #ifdef CONFIG_COMPAT_32BIT_TIME
3124 if (time32)
3125 return put_old_timespec32(&ts, userstamp);
3126 #endif
3127 #ifdef CONFIG_SPARC64
3128 /* beware of padding in sparc64 timeval */
3129 if (timeval && !in_compat_syscall()) {
3130 struct __kernel_old_timeval __user tv = {
3131 .tv_sec = ts.tv_sec,
3132 .tv_usec = ts.tv_nsec,
3133 };
3134 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3135 return -EFAULT;
3136 return 0;
3137 }
3138 #endif
3139 return put_timespec64(&ts, userstamp);
3140 }
3141 EXPORT_SYMBOL(sock_gettstamp);
3142
3143 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3144 {
3145 if (!sock_flag(sk, flag)) {
3146 unsigned long previous_flags = sk->sk_flags;
3147
3148 sock_set_flag(sk, flag);
3149 /*
3150 * we just set one of the two flags which require net
3151 * time stamping, but time stamping might have been on
3152 * already because of the other one
3153 */
3154 if (sock_needs_netstamp(sk) &&
3155 !(previous_flags & SK_FLAGS_TIMESTAMP))
3156 net_enable_timestamp();
3157 }
3158 }
3159
3160 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3161 int level, int type)
3162 {
3163 struct sock_exterr_skb *serr;
3164 struct sk_buff *skb;
3165 int copied, err;
3166
3167 err = -EAGAIN;
3168 skb = sock_dequeue_err_skb(sk);
3169 if (skb == NULL)
3170 goto out;
3171
3172 copied = skb->len;
3173 if (copied > len) {
3174 msg->msg_flags |= MSG_TRUNC;
3175 copied = len;
3176 }
3177 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3178 if (err)
3179 goto out_free_skb;
3180
3181 sock_recv_timestamp(msg, sk, skb);
3182
3183 serr = SKB_EXT_ERR(skb);
3184 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3185
3186 msg->msg_flags |= MSG_ERRQUEUE;
3187 err = copied;
3188
3189 out_free_skb:
3190 kfree_skb(skb);
3191 out:
3192 return err;
3193 }
3194 EXPORT_SYMBOL(sock_recv_errqueue);
3195
3196 /*
3197  *	Get a socket option on a socket.
3198 *
3199 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3200 * asynchronous errors should be reported by getsockopt. We assume
3201  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3202 */
3203 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3204 char __user *optval, int __user *optlen)
3205 {
3206 struct sock *sk = sock->sk;
3207
3208 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3209 }
3210 EXPORT_SYMBOL(sock_common_getsockopt);
3211
3212 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3213 int flags)
3214 {
3215 struct sock *sk = sock->sk;
3216 int addr_len = 0;
3217 int err;
3218
3219 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3220 flags & ~MSG_DONTWAIT, &addr_len);
3221 if (err >= 0)
3222 msg->msg_namelen = addr_len;
3223 return err;
3224 }
3225 EXPORT_SYMBOL(sock_common_recvmsg);
3226
3227 /*
3228 * Set socket options on an inet socket.
3229 */
3230 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3231 sockptr_t optval, unsigned int optlen)
3232 {
3233 struct sock *sk = sock->sk;
3234
3235 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3236 }
3237 EXPORT_SYMBOL(sock_common_setsockopt);
3238
3239 void sk_common_release(struct sock *sk)
3240 {
3241 if (sk->sk_prot->destroy)
3242 sk->sk_prot->destroy(sk);
3243
3244 /*
3245 	 * Observation: when sk_common_release is called, processes have
3246 	 * no access to the socket. But the network stack still does.
3247 * Step one, detach it from networking:
3248 *
3249 * A. Remove from hash tables.
3250 */
3251
3252 sk->sk_prot->unhash(sk);
3253
3254 /*
3255 	 * At this point the socket cannot receive new packets, but it is possible
3256 	 * that some packets are in flight because some CPU runs the receiver and
3257 	 * did a hash table lookup before we unhashed the socket. They will reach
3258 	 * the receive queue and be purged by the socket destructor.
3259 	 *
3260 	 * Also we still have packets pending on the receive queue and probably
3261 	 * our own packets waiting in device queues. sock_destroy will drain the
3262 	 * receive queue, but transmitted packets will delay socket destruction
3263 	 * until the last reference is released.
3264 */
3265
3266 sock_orphan(sk);
3267
3268 xfrm_sk_free_policy(sk);
3269
3270 sk_refcnt_debug_release(sk);
3271
3272 sock_put(sk);
3273 }
3274 EXPORT_SYMBOL(sk_common_release);
3275
3276 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3277 {
3278 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3279
3280 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3281 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3282 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3283 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3284 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3285 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3286 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3287 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3288 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3289 }
3290
3291 #ifdef CONFIG_PROC_FS
3292 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3293 struct prot_inuse {
3294 int val[PROTO_INUSE_NR];
3295 };
3296
3297 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3298
3299 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3300 {
3301 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3302 }
3303 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3304
3305 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3306 {
3307 int cpu, idx = prot->inuse_idx;
3308 int res = 0;
3309
3310 for_each_possible_cpu(cpu)
3311 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3312
3313 return res >= 0 ? res : 0;
3314 }
3315 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3316
3317 static void sock_inuse_add(struct net *net, int val)
3318 {
3319 this_cpu_add(*net->core.sock_inuse, val);
3320 }
3321
3322 int sock_inuse_get(struct net *net)
3323 {
3324 int cpu, res = 0;
3325
3326 for_each_possible_cpu(cpu)
3327 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3328
3329 return res;
3330 }
3331
3332 EXPORT_SYMBOL_GPL(sock_inuse_get);
3333
3334 static int __net_init sock_inuse_init_net(struct net *net)
3335 {
3336 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3337 if (net->core.prot_inuse == NULL)
3338 return -ENOMEM;
3339
3340 net->core.sock_inuse = alloc_percpu(int);
3341 if (net->core.sock_inuse == NULL)
3342 goto out;
3343
3344 return 0;
3345
3346 out:
3347 free_percpu(net->core.prot_inuse);
3348 return -ENOMEM;
3349 }
3350
3351 static void __net_exit sock_inuse_exit_net(struct net *net)
3352 {
3353 free_percpu(net->core.prot_inuse);
3354 free_percpu(net->core.sock_inuse);
3355 }
3356
3357 static struct pernet_operations net_inuse_ops = {
3358 .init = sock_inuse_init_net,
3359 .exit = sock_inuse_exit_net,
3360 };
3361
3362 static __init int net_inuse_init(void)
3363 {
3364 if (register_pernet_subsys(&net_inuse_ops))
3365 panic("Cannot initialize net inuse counters");
3366
3367 return 0;
3368 }
3369
3370 core_initcall(net_inuse_init);
3371
3372 static int assign_proto_idx(struct proto *prot)
3373 {
3374 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3375
3376 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3377 pr_err("PROTO_INUSE_NR exhausted\n");
3378 return -ENOSPC;
3379 }
3380
3381 set_bit(prot->inuse_idx, proto_inuse_idx);
3382 return 0;
3383 }
3384
3385 static void release_proto_idx(struct proto *prot)
3386 {
3387 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3388 clear_bit(prot->inuse_idx, proto_inuse_idx);
3389 }
3390 #else
3391 static inline int assign_proto_idx(struct proto *prot)
3392 {
3393 return 0;
3394 }
3395
3396 static inline void release_proto_idx(struct proto *prot)
3397 {
3398 }
3399
3400 static void sock_inuse_add(struct net *net, int val)
3401 {
3402 }
3403 #endif
3404
3405 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3406 {
3407 if (!twsk_prot)
3408 return;
3409 kfree(twsk_prot->twsk_slab_name);
3410 twsk_prot->twsk_slab_name = NULL;
3411 kmem_cache_destroy(twsk_prot->twsk_slab);
3412 twsk_prot->twsk_slab = NULL;
3413 }
3414
3415 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3416 {
3417 if (!rsk_prot)
3418 return;
3419 kfree(rsk_prot->slab_name);
3420 rsk_prot->slab_name = NULL;
3421 kmem_cache_destroy(rsk_prot->slab);
3422 rsk_prot->slab = NULL;
3423 }
3424
3425 static int req_prot_init(const struct proto *prot)
3426 {
3427 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3428
3429 if (!rsk_prot)
3430 return 0;
3431
3432 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3433 prot->name);
3434 if (!rsk_prot->slab_name)
3435 return -ENOMEM;
3436
3437 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3438 rsk_prot->obj_size, 0,
3439 SLAB_ACCOUNT | prot->slab_flags,
3440 NULL);
3441
3442 if (!rsk_prot->slab) {
3443 pr_crit("%s: Can't create request sock SLAB cache!\n",
3444 prot->name);
3445 return -ENOMEM;
3446 }
3447 return 0;
3448 }
3449
3450 int proto_register(struct proto *prot, int alloc_slab)
3451 {
3452 int ret = -ENOBUFS;
3453
3454 if (alloc_slab) {
3455 prot->slab = kmem_cache_create_usercopy(prot->name,
3456 prot->obj_size, 0,
3457 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3458 prot->slab_flags,
3459 prot->useroffset, prot->usersize,
3460 NULL);
3461
3462 if (prot->slab == NULL) {
3463 pr_crit("%s: Can't create sock SLAB cache!\n",
3464 prot->name);
3465 goto out;
3466 }
3467
3468 if (req_prot_init(prot))
3469 goto out_free_request_sock_slab;
3470
3471 if (prot->twsk_prot != NULL) {
3472 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3473
3474 if (prot->twsk_prot->twsk_slab_name == NULL)
3475 goto out_free_request_sock_slab;
3476
3477 prot->twsk_prot->twsk_slab =
3478 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3479 prot->twsk_prot->twsk_obj_size,
3480 0,
3481 SLAB_ACCOUNT |
3482 prot->slab_flags,
3483 NULL);
3484 if (prot->twsk_prot->twsk_slab == NULL)
3485 goto out_free_timewait_sock_slab;
3486 }
3487 }
3488
3489 mutex_lock(&proto_list_mutex);
3490 ret = assign_proto_idx(prot);
3491 if (ret) {
3492 mutex_unlock(&proto_list_mutex);
3493 goto out_free_timewait_sock_slab;
3494 }
3495 list_add(&prot->node, &proto_list);
3496 mutex_unlock(&proto_list_mutex);
3497 return ret;
3498
3499 out_free_timewait_sock_slab:
3500 if (alloc_slab && prot->twsk_prot)
3501 tw_prot_cleanup(prot->twsk_prot);
3502 out_free_request_sock_slab:
3503 if (alloc_slab) {
3504 req_prot_cleanup(prot->rsk_prot);
3505
3506 kmem_cache_destroy(prot->slab);
3507 prot->slab = NULL;
3508 }
3509 out:
3510 return ret;
3511 }
3512 EXPORT_SYMBOL(proto_register);
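
/* Illustrative sketch (foo_prot and struct foo_sock are hypothetical): a
 * protocol module typically registers its struct proto at init time, asking
 * for a dedicated slab cache, and unregisters it on module exit:
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_prot, 1);
 *	...
 *	proto_unregister(&foo_prot);
 */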
3513
3514 void proto_unregister(struct proto *prot)
3515 {
3516 mutex_lock(&proto_list_mutex);
3517 release_proto_idx(prot);
3518 list_del(&prot->node);
3519 mutex_unlock(&proto_list_mutex);
3520
3521 kmem_cache_destroy(prot->slab);
3522 prot->slab = NULL;
3523
3524 req_prot_cleanup(prot->rsk_prot);
3525 tw_prot_cleanup(prot->twsk_prot);
3526 }
3527 EXPORT_SYMBOL(proto_unregister);
3528
3529 int sock_load_diag_module(int family, int protocol)
3530 {
3531 if (!protocol) {
3532 if (!sock_is_registered(family))
3533 return -ENOENT;
3534
3535 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3536 NETLINK_SOCK_DIAG, family);
3537 }
3538
3539 #ifdef CONFIG_INET
3540 if (family == AF_INET &&
3541 protocol != IPPROTO_RAW &&
3542 protocol < MAX_INET_PROTOS &&
3543 !rcu_access_pointer(inet_protos[protocol]))
3544 return -ENOENT;
3545 #endif
3546
3547 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3548 NETLINK_SOCK_DIAG, family, protocol);
3549 }
3550 EXPORT_SYMBOL(sock_load_diag_module);
3551
3552 #ifdef CONFIG_PROC_FS
3553 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3554 __acquires(proto_list_mutex)
3555 {
3556 mutex_lock(&proto_list_mutex);
3557 return seq_list_start_head(&proto_list, *pos);
3558 }
3559
3560 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3561 {
3562 return seq_list_next(v, &proto_list, pos);
3563 }
3564
3565 static void proto_seq_stop(struct seq_file *seq, void *v)
3566 __releases(proto_list_mutex)
3567 {
3568 mutex_unlock(&proto_list_mutex);
3569 }
3570
3571 static char proto_method_implemented(const void *method)
3572 {
3573 return method == NULL ? 'n' : 'y';
3574 }
3575 static long sock_prot_memory_allocated(struct proto *proto)
3576 {
3577 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3578 }
3579
3580 static const char *sock_prot_memory_pressure(struct proto *proto)
3581 {
3582 return proto->memory_pressure != NULL ?
3583 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3584 }
3585
3586 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3587 {
3588
3589 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3590 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3591 proto->name,
3592 proto->obj_size,
3593 sock_prot_inuse_get(seq_file_net(seq), proto),
3594 sock_prot_memory_allocated(proto),
3595 sock_prot_memory_pressure(proto),
3596 proto->max_header,
3597 proto->slab == NULL ? "no" : "yes",
3598 module_name(proto->owner),
3599 proto_method_implemented(proto->close),
3600 proto_method_implemented(proto->connect),
3601 proto_method_implemented(proto->disconnect),
3602 proto_method_implemented(proto->accept),
3603 proto_method_implemented(proto->ioctl),
3604 proto_method_implemented(proto->init),
3605 proto_method_implemented(proto->destroy),
3606 proto_method_implemented(proto->shutdown),
3607 proto_method_implemented(proto->setsockopt),
3608 proto_method_implemented(proto->getsockopt),
3609 proto_method_implemented(proto->sendmsg),
3610 proto_method_implemented(proto->recvmsg),
3611 proto_method_implemented(proto->sendpage),
3612 proto_method_implemented(proto->bind),
3613 proto_method_implemented(proto->backlog_rcv),
3614 proto_method_implemented(proto->hash),
3615 proto_method_implemented(proto->unhash),
3616 proto_method_implemented(proto->get_port),
3617 proto_method_implemented(proto->enter_memory_pressure));
3618 }
3619
3620 static int proto_seq_show(struct seq_file *seq, void *v)
3621 {
3622 if (v == &proto_list)
3623 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3624 "protocol",
3625 "size",
3626 "sockets",
3627 "memory",
3628 "press",
3629 "maxhdr",
3630 "slab",
3631 "module",
3632 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3633 else
3634 proto_seq_printf(seq, list_entry(v, struct proto, node));
3635 return 0;
3636 }
3637
3638 static const struct seq_operations proto_seq_ops = {
3639 .start = proto_seq_start,
3640 .next = proto_seq_next,
3641 .stop = proto_seq_stop,
3642 .show = proto_seq_show,
3643 };
3644
3645 static __net_init int proto_init_net(struct net *net)
3646 {
3647 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3648 sizeof(struct seq_net_private)))
3649 return -ENOMEM;
3650
3651 return 0;
3652 }
3653
3654 static __net_exit void proto_exit_net(struct net *net)
3655 {
3656 remove_proc_entry("protocols", net->proc_net);
3657 }
3658
3659
3660 static __net_initdata struct pernet_operations proto_net_ops = {
3661 .init = proto_init_net,
3662 .exit = proto_exit_net,
3663 };
3664
3665 static int __init proto_init(void)
3666 {
3667 return register_pernet_subsys(&proto_net_ops);
3668 }
3669
3670 subsys_initcall(proto_init);
3671
3672 #endif /* PROC_FS */
3673
3674 #ifdef CONFIG_NET_RX_BUSY_POLL
3675 bool sk_busy_loop_end(void *p, unsigned long start_time)
3676 {
3677 struct sock *sk = p;
3678
3679 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3680 sk_busy_loop_timeout(sk, start_time);
3681 }
3682 EXPORT_SYMBOL(sk_busy_loop_end);
3683 #endif /* CONFIG_NET_RX_BUSY_POLL */
3684
3685 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3686 {
3687 if (!sk->sk_prot->bind_add)
3688 return -EOPNOTSUPP;
3689 return sk->sk_prot->bind_add(sk, addr, addr_len);
3690 }
3691 EXPORT_SYMBOL(sock_bind_add);
3692