1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NET4: Implementation of BSD Unix domain sockets.
4 *
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
17 * Mike Shaver's work.
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27 * Lots of bug fixes.
 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
 * has been reached. This won't break
33 * old apps and it will avoid huge amount
34 * of socks hashed (this for unix_gc()
35 * performances reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 * [TO FIX]
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 * [NOT TO FIX]
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
68 *
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
75 * with BSD names.
76 */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119
120 #include "scm.h"
121
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125
126 /* SMP locking strategy:
127 * hash table is protected with spinlock.
128 * each socket state is protected by separate spinlock.
129 */
130
unix_unbound_hash(struct sock * sk)131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 unsigned long hash = (unsigned long)sk;
134
135 hash ^= hash >> 16;
136 hash ^= hash >> 8;
137 hash ^= sk->sk_type;
138
139 return hash & UNIX_HASH_MOD;
140 }
141
/* Hash a filesystem-bound (pathname) socket by the inode number of the
 * backing file; the result indexes bsd_socket_buckets[]/locks[].
 */
static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}
146
/* Hash an abstract-namespace address (name bytes, not a pathname) over
 * the full address plus socket type.  The UNIX_HASH_MOD + 1 offset keeps
 * abstract sockets in a bucket range disjoint from unbound sockets.
 */
static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
159
/* Take two hash-table bucket locks at once (e.g. when moving a socket
 * between buckets).  Locks are always acquired in ascending index order
 * to avoid ABBA deadlock; spin_lock_nested() silences lockdep for the
 * second lock of the same class.
 */
static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	/* Same bucket: only one lock to take. */
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}
174
/* Release the pair of bucket locks taken by unix_table_double_lock(). */
static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}
186
187 #ifdef CONFIG_SECURITY_NETWORK
unix_get_secdata(struct scm_cookie * scm,struct sk_buff * skb)188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 UNIXCB(skb).secid = scm->secid;
191 }
192
unix_set_secdata(struct scm_cookie * scm,struct sk_buff * skb)193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 scm->secid = UNIXCB(skb).secid;
196 }
197
unix_secdata_eq(struct scm_cookie * scm,struct sk_buff * skb)198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
unix_get_secdata(struct scm_cookie * scm,struct sk_buff * skb)203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205
unix_set_secdata(struct scm_cookie * scm,struct sk_buff * skb)206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208
unix_secdata_eq(struct scm_cookie * scm,struct sk_buff * skb)209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214
215 #define unix_peer(sk) (unix_sk(sk)->peer)
216
/* True if @osk's connected peer is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}
221
/* @sk may send to @osk if @osk is unconnected or connected back to @sk.
 * A connected dgram socket only accepts messages from its peer.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}
226
/* Receive queue is over the backlog limit; caller holds the queue/state
 * lock (plain, non-annotated reads).
 */
static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
231
/* Lockless variant of unix_recvq_full() for use without the queue lock;
 * both loads are annotated for concurrent writers.
 */
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}
237
/* Return @s's peer with a reference held, or NULL if unconnected.
 * The state lock makes the peer-read and sock_hold() atomic with
 * respect to disconnect; the caller must sock_put() the result.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
250
/* Allocate a refcounted unix_address holding a copy of @sunaddr
 * (addr_len bytes copied into the trailing name storage).
 * Returns NULL on allocation failure.  The initial reference belongs
 * to the caller and is dropped with unix_release_addr().
 */
static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}
266
/* Drop one reference on @addr, freeing it when the last one goes away. */
static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
272
273 /*
274 * Check unix socket name:
275 * - should be not zero length.
276 * - if started by not zero, should be NULL terminated (FS object)
277 * - if started by zero, it is abstract name.
278 */
279
unix_validate_addr(struct sockaddr_un * sunaddr,int addr_len)280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
281 {
282 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
283 addr_len > sizeof(*sunaddr))
284 return -EINVAL;
285
286 if (sunaddr->sun_family != AF_UNIX)
287 return -EINVAL;
288
289 return 0;
290 }
291
/* Normalize a pathname address in place: NUL-terminate the path and
 * return the effective address length including the terminator.
 */
static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle. 108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist. However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108. Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}
316
/* Unhash @sk from its hash-table bucket; caller holds the bucket lock. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
321
/* Hash @sk into the bucket selected by sk->sk_hash; caller holds the
 * bucket lock and @sk must not already be hashed.
 */
static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}
327
/* Bind @addr to @sk and move it to the bucket for @hash.  The release
 * store pairs with lockless readers of unix_sk(sk)->addr so they only
 * observe a fully initialized address.  Caller holds both bucket locks
 * (see unix_table_double_lock()).
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}
337
/* Locked wrapper: unhash @sk from its current bucket. */
static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}
344
/* Locked wrapper: hash a freshly created (unbound) socket. */
static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}
351
/* Add a pathname-bound socket to the per-inode-hash bind list used by
 * unix_find_socket_byinode().
 */
static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}
358
/* Remove @sk from the bsd bind list if it was ever inserted; no-op for
 * sockets that were never pathname-bound.
 */
static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}
369
/* Walk bucket @hash for a socket bound to exactly @sunname/@len.
 * Caller holds the bucket lock; no reference is taken on the result.
 */
static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}
385
/* Locked lookup by bound name; returns the socket with a reference
 * held (caller must sock_put()), or NULL.
 */
static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}
399
/* Find the socket bound to the filesystem object backed by inode @i
 * (pathname namespace).  Returns it with a reference held, or NULL.
 */
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}
418
419 /* Support code for asymmetrically connected dgram sockets
420 *
421 * If a datagram socket is connected to a socket not itself connected
422 * to the first socket (eg, /dev/log), clients may only enqueue more
423 * messages if the present receive queue of the server socket is not
424 * "too large". This means there's a second writeability condition
425 * poll and sendmsg need to test. The dgram recv code will do a wake
426 * up on the peer_wait wait queue of a socket upon reception of a
427 * datagram which needs to be propagated to sleeping would-be writers
428 * since these might not have sent anything so far. This can't be
429 * accomplished via poll_wait because the lifetime of the server
430 * socket might be less than that of its clients if these break their
431 * association with it or if the server socket is closed while clients
432 * are still connected to it and there's no way to inform "a polling
433 * implementation" that it should let go of a certain wait queue
434 *
435 * In order to propagate a wake up, a wait_queue_entry_t of the client
436 * socket is enqueued on the peer_wait queue of the server socket
437 * whose wake function does a wake_up on the ordinary client socket
438 * wait queue. This connection is established whenever a write (or
439 * poll for write) hit the flow control condition and broken when the
440 * association to the server socket is dissolved or after a wake up
441 * was relayed.
442 */
443
/* Wake callback installed on a server's peer_wait queue: relay the wake
 * up to the client socket's own wait queue, then self-remove so each
 * connect/relay cycle is one-shot (see the big comment above).
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	/* Called with the waiters' peer_wait.lock held, so removal here
	 * is safe; peer_wake.private points at the server socket.
	 */
	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}
463
/* Enqueue @sk's relay entry on @other's peer_wait queue so a future
 * dgram reception on @other wakes @sk's writers.  Returns 1 if the
 * entry was newly installed, 0 if it was already queued (idempotent
 * under @other's peer_wait.lock).
 */
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}
484
/* Undo unix_dgram_peer_wake_connect(): remove @sk's relay entry from
 * @other's peer_wait queue if (and only if) it is still linked there.
 * The private-pointer check guards against a concurrent relay having
 * already detached it.
 */
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}
501
/* Disconnect the relay entry and then wake @sk's own writers, since the
 * association to @other is gone and writes may now proceed (or fail).
 */
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
511
512 /* preconditions:
513 * - unix_peer(sk) == other
514 * - association is stable
515 */
/* Decide whether a dgram writer must sleep on @other's flow control.
 * Returns 1 if @other's queue is full (relay stays armed so the reader
 * will wake us); returns 0 — after disarming any relay we just armed —
 * if the writer may proceed.
 *
 * preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
535
/* Writable when not listening and outstanding write memory is at or
 * below a quarter of sk_sndbuf (the << 2 applies the 4x headroom).
 */
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
541
/* sk->sk_write_space callback: when the socket becomes writable again,
 * wake pollers sleeping for EPOLLOUT and send SIGIO to async owners.
 * RCU protects the wq against a concurrent socket release.
 */
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
556
557 /* When dgram socket disconnects (or changes its peer), we clear its receive
558 * queue of packets arrived from previous peer. First, it allows to do
559 * flow control based only on wmem_alloc; second, sk connected to peer
560 * may receive messages only from that peer. */
unix_dgram_disconnected(struct sock * sk,struct sock * other)561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
562 {
563 if (!skb_queue_empty(&sk->sk_receive_queue)) {
564 skb_queue_purge(&sk->sk_receive_queue);
565 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
566
567 /* If one link of bidirectional dgram pipe is disconnected,
568 * we signal error. Messages are lost. Do not make this,
569 * when peer was not connected to us.
570 */
571 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
572 WRITE_ONCE(other->sk_err, ECONNRESET);
573 sk_error_report(other);
574 }
575 }
576 other->sk_state = TCP_CLOSE;
577 }
578
/* sk->sk_destruct callback: final teardown once the last reference is
 * gone.  Purges any remaining skbs, releases the bound address, and
 * updates the global/per-net socket accounting.  Bails out loudly if
 * the socket is not yet marked SOCK_DEAD.
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
603
/* Tear down a unix socket: unhash it, mark it dead/closed, notify the
 * peer, flush queued skbs and drop the filesystem path reference.
 * @embrion: non-zero when releasing an unaccepted embryo socket found
 * on a listener's receive queue; the stream peer then sees ECONNRESET.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	/* Take the path out of the socket so it can be dropped after
	 * the state lock is released.
	 */
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			/* Unread data or an unaccepted embryo means the
			 * peer's connection terminated abnormally.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* A listener's queue holds embryo sockets; release each
		 * recursively (embrion = 1).
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}
686
/* Record the current task's pid/credentials as @sk's peer credentials
 * (SO_PEERCRED).  Old values are dropped outside the lock.
 */
static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}
702
/* Copy @peersk's peer credentials into @sk.  Both sk_peer_locks are
 * taken in a fixed (address) order to avoid ABBA deadlock when two
 * sockets exchange credentials concurrently.
 */
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}
726
/* listen(2) for AF_UNIX: only bound stream/seqpacket sockets may
 * listen.  Raising the backlog wakes blocked connecters waiting on
 * peer_wait so they can retry against the larger queue.
 */
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}
755
756 static int unix_release(struct socket *);
757 static int unix_bind(struct socket *, struct sockaddr *, int);
758 static int unix_stream_connect(struct socket *, struct sockaddr *,
759 int addr_len, int flags);
760 static int unix_socketpair(struct socket *, struct socket *);
761 static int unix_accept(struct socket *, struct socket *, int, bool);
762 static int unix_getname(struct socket *, struct sockaddr *, int);
763 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
764 static __poll_t unix_dgram_poll(struct file *, struct socket *,
765 poll_table *);
766 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
767 #ifdef CONFIG_COMPAT
768 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
769 #endif
770 static int unix_shutdown(struct socket *, int);
771 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
772 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
773 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
774 struct pipe_inode_info *, size_t size,
775 unsigned int flags);
776 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
777 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
778 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
779 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
780 static int unix_dgram_connect(struct socket *, struct sockaddr *,
781 int, int);
782 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
783 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
784 int);
785
/* SO_PEEK_OFF setter: iolock serializes against concurrent readers that
 * consult sk_peek_off; interruptible so a signal aborts with -EINTR.
 */
static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	WRITE_ONCE(sk->sk_peek_off, val);
	mutex_unlock(&u->iolock);

	return 0;
}
798
799 #ifdef CONFIG_PROC_FS
/* Sum the SCM-passed fd counts of all sockets queued on listener @sk's
 * receive queue (each queued skb's sk is an embryo socket).  Queue lock
 * keeps the walk stable.
 */
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}
817
/* /proc/<pid>/fdinfo hook: report how many SCM_RIGHTS fds are queued
 * on this socket (or, for a listener, across its embryo sockets).
 */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
841 #else
842 #define unix_show_fdinfo NULL
843 #endif
844
/* proto_ops for SOCK_STREAM AF_UNIX sockets. */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
869
/* proto_ops for SOCK_DGRAM AF_UNIX sockets (no accept/listen). */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
893
/* proto_ops for SOCK_SEQPACKET: connection-oriented like stream
 * (stream connect/accept/listen) but with dgram-style polling.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
916
/* No-op proto ->close(); present only so sockmap can call it. */
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}
923
/* No-op proto ->unhash(); present only so sockmap can call it. */
static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}
930
unix_bpf_bypass_getsockopt(int level,int optname)931 static bool unix_bpf_bypass_getsockopt(int level, int optname)
932 {
933 if (level == SOL_SOCKET) {
934 switch (optname) {
935 case SO_PEERPIDFD:
936 return true;
937 default:
938 return false;
939 }
940 }
941
942 return false;
943 }
944
/* struct proto shared by SOCK_DGRAM and SOCK_SEQPACKET unix sockets
 * (see unix_create1()).
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};
955
/* struct proto for SOCK_STREAM unix sockets. */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
967
/* Allocate and initialise a new AF_UNIX sock.
 *
 * @net:  network namespace the socket belongs to
 * @sock: owning struct socket, or NULL for the embryo sock created on
 *        behalf of a connecting peer (see unix_stream_connect())
 * @kern: non-zero for kernel-internal sockets
 * @type: SOCK_STREAM, SOCK_DGRAM or SOCK_SEQPACKET
 *
 * The new sock is inserted into the unbound hash table and accounted in
 * unix_nr_socks, which is capped at twice the system file limit.
 * Returns the sock or an ERR_PTR() on failure.
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	/* Optimistically charge first, undo at the err label on failure. */
	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	/* SOCK_STREAM has its own proto so sockmap can override it
	 * independently of the dgram/seqpacket one.
	 */
	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash = unix_unbound_hash(sk);
	sk->sk_allocation = GFP_KERNEL_ACCOUNT;
	sk->sk_write_space = unix_write_space;
	/* NOTE(review): the dgram qlen sysctl seeds the backlog for all
	 * three socket types; listen() may adjust it later.
	 */
	sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct = unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}
1018
unix_create(struct net * net,struct socket * sock,int protocol,int kern)1019 static int unix_create(struct net *net, struct socket *sock, int protocol,
1020 int kern)
1021 {
1022 struct sock *sk;
1023
1024 if (protocol && protocol != PF_UNIX)
1025 return -EPROTONOSUPPORT;
1026
1027 sock->state = SS_UNCONNECTED;
1028
1029 switch (sock->type) {
1030 case SOCK_STREAM:
1031 sock->ops = &unix_stream_ops;
1032 break;
1033 /*
1034 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1035 * nothing uses it.
1036 */
1037 case SOCK_RAW:
1038 sock->type = SOCK_DGRAM;
1039 fallthrough;
1040 case SOCK_DGRAM:
1041 sock->ops = &unix_dgram_ops;
1042 break;
1043 case SOCK_SEQPACKET:
1044 sock->ops = &unix_seqpacket_ops;
1045 break;
1046 default:
1047 return -ESOCKTNOSUPPORT;
1048 }
1049
1050 sk = unix_create1(net, sock, kern, sock->type);
1051 if (IS_ERR(sk))
1052 return PTR_ERR(sk);
1053
1054 return 0;
1055 }
1056
unix_release(struct socket * sock)1057 static int unix_release(struct socket *sock)
1058 {
1059 struct sock *sk = sock->sk;
1060
1061 if (!sk)
1062 return 0;
1063
1064 sk->sk_prot->close(sk, 0);
1065 unix_release_sock(sk, 0);
1066 sock->sk = NULL;
1067
1068 return 0;
1069 }
1070
/* Look up the peer sock for a filesystem (pathname) address.
 *
 * Resolves sun_path, requires write permission on the resulting inode,
 * verifies it is a socket inode and finds the sock bound to it.  On
 * success the path's atime is touched and a referenced sock is returned.
 * Failure returns an ERR_PTR: the kern_path()/path_permission() error,
 * -ECONNREFUSED for a non-socket or unbound inode, or -EPROTOTYPE when
 * the bound socket's type does not match @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	/* NUL-terminate sun_path in place before the VFS lookup. */
	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	/* unix_find_socket_byinode() returns a referenced sock. */
	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

	/* Error unwind: sock_put falls through to path_put deliberately. */
sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}
1114
/* Look up the peer sock for an abstract-namespace address (leading NUL in
 * sun_path).  Returns a referenced sock, or ERR_PTR(-ECONNREFUSED) when
 * no socket is bound to the name.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	/* An abstract socket normally has no dentry; touch atime only in
	 * the odd case it does.
	 */
	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1133
unix_find_other(struct net * net,struct sockaddr_un * sunaddr,int addr_len,int type)1134 static struct sock *unix_find_other(struct net *net,
1135 struct sockaddr_un *sunaddr,
1136 int addr_len, int type)
1137 {
1138 struct sock *sk;
1139
1140 if (sunaddr->sun_path[0])
1141 sk = unix_find_bsd(sunaddr, addr_len, type);
1142 else
1143 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1144
1145 return sk;
1146 }
1147
/* Bind @sk to a random 5-hex-digit abstract name ("\0XXXXX").
 *
 * Called for unbound sockets when autobind semantics apply (bind() with a
 * bare AF_UNIX address, or send/connect with SOCK_PASSCRED/PASSPIDFD).
 * Names are probed starting from a random point; after all 2^20 values
 * have been tried, -ENOSPC is returned.  Serialised against explicit
 * bind() by u->bindlock; an already-bound socket succeeds as a no-op.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound (possibly raced with an explicit bind). */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	/* Room for "\0XXXXX" plus slack; kzalloc keeps the name padded. */
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* len covers family + '\0' + 5 hex digits. */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Publish the address and rehash while both buckets are locked. */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1208
/* Bind @sk to a filesystem (pathname) address.
 *
 * Creates the socket inode with vfs_mknod() (mode masked by the caller's
 * umask), then publishes the address and rehashes the sock under the
 * table double lock.  If binding fails after the inode was created, the
 * inode is unlinked again.  -EEXIST from the VFS is reported to userspace
 * as -EADDRINUSE.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	/* Take path references before exposing the sock in its new hash
	 * bucket.
	 */
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;	/* socket got bound concurrently */
out_unlink:
	/* failed after successful mknod? unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
1276
/* Bind @sk to an abstract-namespace address.
 *
 * Takes u->bindlock to serialise with other binds, then — under the hash
 * table double lock — checks the name is free and rehashes the sock to
 * its new bucket.  Returns 0, -ENOMEM, -EINVAL if already bound, or
 * -EADDRINUSE if another socket holds the name.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	/* Name uniqueness check and rehash must be one critical section. */
	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}
1319
unix_bind(struct socket * sock,struct sockaddr * uaddr,int addr_len)1320 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1321 {
1322 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1323 struct sock *sk = sock->sk;
1324 int err;
1325
1326 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1327 sunaddr->sun_family == AF_UNIX)
1328 return unix_autobind(sk);
1329
1330 err = unix_validate_addr(sunaddr, addr_len);
1331 if (err)
1332 return err;
1333
1334 if (sunaddr->sun_path[0])
1335 err = unix_bind_bsd(sk, sunaddr, addr_len);
1336 else
1337 err = unix_bind_abstract(sk, sunaddr, addr_len);
1338
1339 return err;
1340 }
1341
/* Take the state locks of two socks in a canonical (address) order so two
 * concurrent double-locks cannot deadlock.  With a NULL or identical
 * second sock, only one lock is taken.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first = sk1, *second = sk2;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	/* Lower address locks first; the inner lock uses the nested
	 * annotation for lockdep.
	 */
	if (first > second) {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1356
/* Release the locks taken by unix_state_double_lock(); unlock order does
 * not matter.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	/* Only one lock was taken when the peer was absent or identical. */
	if (sk2 && sk1 != sk2)
		unix_state_unlock(sk2);
}
1366
/* ->connect() for SOCK_DGRAM sockets: pin a default peer.
 *
 * AF_UNSPEC dissolves an existing association (1003.1g).  Otherwise the
 * destination is looked up and, under the double state lock, permission
 * checks run and both socks move to TCP_ESTABLISHED.  A previous peer is
 * disconnected and woken.  Returns 0 or a negative errno.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* Credential passing requires an address; autobind if the
		 * socket has none yet.
		 */
		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 * 1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		/* Detach from the old peer's wake queue and wake anyone
		 * blocked waiting to send to it via us.
		 */
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);	/* drop the old peer reference */
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1454
/* Sleep until @other's receive queue may have room again (or timeout).
 *
 * Called with other's state lock held; the lock is ALWAYS released (see
 * the __releases annotation).  The wait entry is queued before the lock
 * is dropped and the condition re-checked, so a wakeup between unlock and
 * schedule is not lost.  Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Only sleep while the peer is alive, readable and still full. */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1476
/* ->connect() for SOCK_STREAM/SOCK_SEQPACKET.
 *
 * Pre-allocates an embryo sock (newsk) and a notification skb, finds the
 * listener, waits for backlog space if needed, then — with both state
 * locks held — wires newsk to @sk, copies the listener's address/path
 * into newsk and queues the skb (which carries newsk via skb->sk) on the
 * listener's receive queue for unix_accept() to pick up.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* Credential passing requires an address; autobind if unbound. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's lock while sleeping. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* Raced with another connect on this socket; start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	sk->sk_state = TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);	/* tear down unused embryo */
	if (other)
		sock_put(other);
	return err;
}
1670
unix_socketpair(struct socket * socka,struct socket * sockb)1671 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1672 {
1673 struct sock *ska = socka->sk, *skb = sockb->sk;
1674
1675 /* Join our sockets back to back */
1676 sock_hold(ska);
1677 sock_hold(skb);
1678 unix_peer(ska) = skb;
1679 unix_peer(skb) = ska;
1680 init_peercred(ska);
1681 init_peercred(skb);
1682
1683 ska->sk_state = TCP_ESTABLISHED;
1684 skb->sk_state = TCP_ESTABLISHED;
1685 socka->state = SS_CONNECTED;
1686 sockb->state = SS_CONNECTED;
1687 return 0;
1688 }
1689
unix_sock_inherit_flags(const struct socket * old,struct socket * new)1690 static void unix_sock_inherit_flags(const struct socket *old,
1691 struct socket *new)
1692 {
1693 if (test_bit(SOCK_PASSCRED, &old->flags))
1694 set_bit(SOCK_PASSCRED, &new->flags);
1695 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1696 set_bit(SOCK_PASSPIDFD, &new->flags);
1697 if (test_bit(SOCK_PASSSEC, &old->flags))
1698 set_bit(SOCK_PASSSEC, &new->flags);
1699 }
1700
/* ->accept() for SOCK_STREAM/SOCK_SEQPACKET.
 *
 * Dequeues one notification skb from the listener's receive queue (see
 * unix_stream_connect(): skb->sk is the ready embryo sock) and grafts it
 * onto @newsock.  Blocks unless O_NONBLOCK.  Returns 0 or negative errno.
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* Backlog shrank; wake a peer blocked in unix_wait_for_peer(). */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
1745
1746
/* ->getname(): copy the socket's (or, with @peer set, its peer's) bound
 * address into @uaddr.  An unbound socket yields a bare AF_UNIX address.
 * Returns the address length, or -ENOTCONN when @peer is requested on an
 * unconnected socket.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		/* unix_peer_get() returns a referenced peer or NULL. */
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	/* Pairs with the smp_store_release() publishing addr at bind. */
	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
1778
unix_peek_fds(struct scm_cookie * scm,struct sk_buff * skb)1779 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1780 {
1781 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1782
1783 /*
1784 * Garbage collection of unix sockets starts by selecting a set of
1785 * candidate sockets which have reference only from being in flight
1786 * (total_refs == inflight_refs). This condition is checked once during
1787 * the candidate collection phase, and candidates are marked as such, so
1788 * that non-candidates can later be ignored. While inflight_refs is
1789 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1790 * is an instantaneous decision.
1791 *
1792 * Once a candidate, however, the socket must not be reinstalled into a
1793 * file descriptor while the garbage collection is in progress.
1794 *
1795 * If the above conditions are met, then the directed graph of
1796 * candidates (*) does not change while unix_gc_lock is held.
1797 *
1798 * Any operations that changes the file count through file descriptors
1799 * (dup, close, sendmsg) does not change the graph since candidates are
1800 * not installed in fds.
1801 *
1802 * Dequeing a candidate via recvmsg would install it into an fd, but
1803 * that takes unix_gc_lock to decrement the inflight count, so it's
1804 * serialized with garbage collection.
1805 *
1806 * MSG_PEEK is special in that it does not change the inflight count,
1807 * yet does install the socket into an fd. The following lock/unlock
1808 * pair is to ensure serialization with garbage collection. It must be
1809 * done between incrementing the file count and installing the file into
1810 * an fd.
1811 *
1812 * If garbage collection starts after the barrier provided by the
1813 * lock/unlock, then it will see the elevated refcount and not mark this
1814 * as a candidate. If a garbage collection is already in progress
1815 * before the file count was incremented, then the lock/unlock pair will
1816 * ensure that garbage collection is finished before progressing to
1817 * installing the fd.
1818 *
1819 * (*) A -> B where B is on the queue of A or B is on the queue of C
1820 * which is on the queue of listening socket A.
1821 */
1822 spin_lock(&unix_gc_lock);
1823 spin_unlock(&unix_gc_lock);
1824 }
1825
/* Move ancillary data (pid, creds, security label and — when @send_fds —
 * the passed file descriptors) from @scm into the skb's control block.
 * Returns 0 or the unix_attach_fds() error.
 */
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);

	if (send_fds && scm->fp)
		err = unix_attach_fds(scm, skb);

	/* The destructor releases the pid/cred/fd references above. */
	skb->destructor = unix_destruct_scm;

	return err;
}
1841
unix_passcred_enabled(const struct socket * sock,const struct sock * other)1842 static bool unix_passcred_enabled(const struct socket *sock,
1843 const struct sock *other)
1844 {
1845 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1846 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1847 !other->sk_socket ||
1848 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1849 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1850 }
1851
1852 /*
1853 * Some apps rely on write() giving SCM_CREDENTIALS
1854 * We include credentials if source or destination socket
1855 * asserted SOCK_PASSCRED.
1856 */
maybe_add_creds(struct sk_buff * skb,const struct socket * sock,const struct sock * other)1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1858 const struct sock *other)
1859 {
1860 if (UNIXCB(skb).pid)
1861 return;
1862 if (unix_passcred_enabled(sock, other)) {
1863 UNIXCB(skb).pid = get_pid(task_tgid(current));
1864 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1865 }
1866 }
1867
unix_skb_scm_eq(struct sk_buff * skb,struct scm_cookie * scm)1868 static bool unix_skb_scm_eq(struct sk_buff *skb,
1869 struct scm_cookie *scm)
1870 {
1871 return UNIXCB(skb).pid == scm->pid &&
1872 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1873 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1874 unix_secdata_eq(scm, skb);
1875 }
1876
scm_stat_add(struct sock * sk,struct sk_buff * skb)1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1878 {
1879 struct scm_fp_list *fp = UNIXCB(skb).fp;
1880 struct unix_sock *u = unix_sk(sk);
1881
1882 if (unlikely(fp && fp->count))
1883 atomic_add(fp->count, &u->scm_stat.nr_fds);
1884 }
1885
scm_stat_del(struct sock * sk,struct sk_buff * skb)1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1887 {
1888 struct scm_fp_list *fp = UNIXCB(skb).fp;
1889 struct unix_sock *u = unix_sk(sk);
1890
1891 if (unlikely(fp && fp->count))
1892 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1893 }
1894
1895 /*
1896 * Send AF_UNIX data.
1897 */
1898
/*
 * Send AF_UNIX data.
 */

/* ->sendmsg() for SOCK_DGRAM (also reached by connected SOCK_SEQPACKET
 * via unix_seqpacket_sendmsg — note the SEQPACKET special cases below).
 *
 * Resolves the destination (msg_name or the connected peer), builds a
 * possibly paged skb, then under the peer's state lock re-validates the
 * peer (death, shutdown, permission), waits or polls on a full receive
 * queue, and finally queues the skb and wakes the reader.  Returns the
 * number of bytes sent or a negative errno.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	/* Throttle senders while the fd garbage collector runs. */
	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* Credential passing needs an address; autobind if unbound. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Large messages: linear head up to SKB_MAX_ALLOC, rest in pages. */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		/* We reach here after the peer died mid-send; without an
		 * explicit destination there is nothing to retry against.
		 */
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			/* Dissolve the dead association ourselves. */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* Blocking send: sleep on the peer's wait queue
			 * (drops other's lock) and retry from scratch.
			 */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: take both state locks in order so we can
		 * safely arm the peer-wake callback before bailing out.
		 */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2110
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * get_order(32768) rounds the cap up to whole pages, so the macro is
 * the frag budget used when sizing stream skbs below.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115
2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Queue a single out-of-band byte on @other's receive queue.
 *
 * Allocates a 1-byte skb, attaches SCM state from @scm (fds only when
 * @fds_sent is false, i.e. this message hasn't carried them yet),
 * copies the byte from @msg, and publishes the skb as @other's current
 * oob_skb under @other's state lock.
 *
 * Returns 0 on success, -EPIPE when @other is dead or shut down for
 * receive, or a negative errno from allocation/scm/copy failure.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	/* Receiver gone or closed for reading: drop the byte. */
	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Extra reference: the skb is reachable both from the receive
	 * queue and from ousk->oob_skb.
	 */
	skb_get(skb);

	/* A previously queued, not yet consumed OOB byte is superseded;
	 * drop the oob_skb reference to the old one.
	 */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	/* WRITE_ONCE() pairs with lockless readers (e.g. SIOCATMARK). */
	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
2167 #endif
2168
/* sendmsg() for SOCK_STREAM sockets.
 *
 * The payload is chopped into skbs of at most half the send buffer
 * (so two messages can be in flight for better scheduling) and queued
 * directly onto the peer's receive queue under the peer's state lock.
 * File descriptors and credentials from @msg's control data ride only
 * on the first skb.  With CONFIG_AF_UNIX_OOB, MSG_OOB reserves the
 * final byte and sends it out-of-band via queue_oob().
 *
 * Returns the number of bytes sent; a negative errno only when nothing
 * was sent (a partial send returns the short count instead).
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Reserve the last byte for the OOB send below. */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	/* Connected stream: a destination address is never accepted. */
	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Zero-copy: pages are spliced in below, so the
			 * skb itself carries no data area.
			 */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			size = err;	/* may splice less than requested */
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Peer vanished or stopped reading mid-transfer. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* SIGPIPE only if nothing was delivered and not suppressed. */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
2300
/* sendmsg() for SOCK_SEQPACKET: validate connection state, then reuse
 * the datagram send path on the established peer.
 */
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	struct sock *sk = sock->sk;
	int err = sock_error(sk);

	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	/* Connection-oriented: any supplied destination is ignored. */
	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}
2319
/* recvmsg() for SOCK_SEQPACKET: datagram receive, but only while the
 * connection is established.
 */
static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	if (sock->sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}
2330
unix_copy_addr(struct msghdr * msg,struct sock * sk)2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2332 {
2333 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2334
2335 if (addr) {
2336 msg->msg_namelen = addr->len;
2337 memcpy(msg->msg_name, addr->name, addr->len);
2338 }
2339 }
2340
/* Common recvmsg() path for SOCK_DGRAM and SOCK_SEQPACKET.
 *
 * Dequeues (or, with MSG_PEEK, peeks) one datagram under u->iolock,
 * honouring the socket's peek offset, copies it into @msg, and attaches
 * sender credentials / file descriptors via SCM.  Oversized datagrams
 * are truncated and MSG_TRUNC is set in msg->msg_flags.
 *
 * Returns the number of bytes copied (or the full datagram length when
 * the caller passed MSG_TRUNC in @flags), 0 for SEQPACKET EOF, or a
 * negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	/* OOB is a stream-only concept. */
	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Try to grab a datagram; on -EAGAIN, sleep for more packets and
	 * retry until the timeout expires.  iolock serializes readers.
	 */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;	/* success: iolock still held */
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* We freed queue space; wake a peer blocked in dgram send. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to the datagram's remaining length; flag truncation. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		/* Consuming read: take ownership of any passed fds and
		 * rewind the peek offset past this datagram.
		 */
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2447
/* recvmsg() entry point for SOCK_DGRAM.  If a BPF sockmap has swapped
 * sk_prot away from unix_dgram_proto, defer to that implementation;
 * otherwise use the common datagram path above.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}
2461
/* Pop one queued datagram (non-blocking) and hand it to @recv_actor.
 * Returns the actor's result, or the negative errno from the failed
 * dequeue when the queue was empty.
 */
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);

	return skb ? recv_actor(sk, skb) : err;
}
2476
/*
 * Sleep until more data has arrived. But check for races..
 *
 * @last/@last_len identify the queue tail the caller last observed;
 * the wait ends as soon as the tail (or the tail's length, for a
 * growing skb) changes, or on error/shutdown/signal/expired timeout.
 * Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		/* Anything changed since the caller peeked? */
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Socket died while we slept: bail out without touching
		 * the async-wait flag again.
		 */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2517
/* Bytes of @skb a stream reader has not yet consumed. */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
2522
/* Shared state for the generic stream receive engine
 * (unix_stream_read_generic()).  The same engine serves recvmsg() and
 * splice_read(); recv_actor decides where each chunk of skb data goes.
 */
struct unix_stream_read_state {
	/* Copy up to @chunk bytes of @skb starting at @skip; returns
	 * bytes delivered or a negative value on failure.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;		/* recvmsg destination; NULL when splicing */
	struct pipe_inode_info *pipe;	/* splice destination */
	size_t size;			/* total bytes requested */
	int flags;			/* MSG_* flags */
	unsigned int splice_flags;	/* SPLICE_F_* flags */
};
2533
2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Handle MSG_OOB receive on a stream socket: deliver the single pending
 * out-of-band byte via state->recv_actor.
 *
 * -EINVAL when there is no OOB byte or SO_OOBINLINE is set (inline OOB
 * data is read through the normal path instead).  A non-PEEK read
 * consumes the oob_skb reference.  Returns 1 (one byte) on success and
 * sets MSG_OOB in the result flags.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	/* Consuming read: unpublish the OOB skb before dropping the
	 * state lock (WRITE_ONCE pairs with lockless readers).
	 */
	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);

	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK)) {
		/* Mark the byte consumed and drop the oob_skb reference. */
		UNIXCB(oob_skb).consumed += 1;
		kfree_skb(oob_skb);
	}

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
2574
/* Decide how the queue-head @skb relates to the pending OOB byte before
 * the stream reader copies from it.  Called with unix_state_lock(sk)
 * held.
 *
 * Returns the skb the caller should read from, or NULL when it should
 * re-peek the queue (or stop, if it already copied @copied bytes).
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		/* Fully consumed skb left at the head (e.g. an already
		 * read OOB byte): unlink and drop it, then re-peek.
		 */
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		if (skb == u->oob_skb) {
			if (copied) {
				/* Stop the in-band read right before the
				 * OOB mark.
				 */
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				/* SO_OOBINLINE: the byte is read in-band;
				 * a consuming read unpublishes oob_skb and
				 * drops its extra reference.
				 */
				if (!(flags & MSG_PEEK)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (!(flags & MSG_PEEK)) {
				/* Not inline: skip past the OOB skb and
				 * read the next in-band skb instead.
				 */
				skb_unlink(skb, &sk->sk_receive_queue);
				consume_skb(skb);
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}
	}
	return skb;
}
2602 #endif
2603
/* Stream-socket variant of read_skb: only valid on an established
 * connection; otherwise report -ENOTCONN.
 */
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	if (likely(sk->sk_state == TCP_ESTABLISHED))
		return unix_read_skb(sk, recv_actor);

	return -ENOTCONN;
}
2611
/* Engine behind SOCK_STREAM recvmsg() and splice_read().
 *
 * Walks the receive queue under u->iolock, feeding each skb chunk to
 * state->recv_actor until state->size bytes are delivered, the
 * SO_RCVLOWAT target is met, or a stop condition hits (error,
 * shutdown, OOB mark, fd/credential boundary).  Messages carrying
 * different credentials or file descriptors are never glued together.
 *
 * Returns bytes copied, or a negative errno when nothing was copied.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Resolve the head skb against the OOB mark; NULL means
		 * "stop if we copied anything, else re-peek".
		 */
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Drop iolock while sleeping so writers and other
			 * readers can make progress.
			 */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Consume the peek offset across whole skbs. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* fds delivered: stop so they reach the caller now. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2826
unix_stream_read_actor(struct sk_buff * skb,int skip,int chunk,struct unix_stream_read_state * state)2827 static int unix_stream_read_actor(struct sk_buff *skb,
2828 int skip, int chunk,
2829 struct unix_stream_read_state *state)
2830 {
2831 int ret;
2832
2833 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2834 state->msg, chunk);
2835 return ret ?: chunk;
2836 }
2837
/* Stream receive on a bare struct sock (used by the BPF/sockmap path):
 * set up read state and run the generic engine (freezable wait).
 */
int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags,
		.recv_actor = unix_stream_read_actor,
	};

	return unix_stream_read_generic(&state, true);
}
2851
/* recvmsg() entry point for SOCK_STREAM.  If a BPF sockmap has swapped
 * sk_prot away from unix_stream_proto, defer to that implementation;
 * otherwise run the generic stream read engine.
 */
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
2872
unix_stream_splice_actor(struct sk_buff * skb,int skip,int chunk,struct unix_stream_read_state * state)2873 static int unix_stream_splice_actor(struct sk_buff *skb,
2874 int skip, int chunk,
2875 struct unix_stream_read_state *state)
2876 {
2877 return skb_splice_bits(skb, state->socket->sk,
2878 UNIXCB(skb).consumed + skip,
2879 state->pipe, chunk, state->splice_flags);
2880 }
2881
unix_stream_splice_read(struct socket * sock,loff_t * ppos,struct pipe_inode_info * pipe,size_t size,unsigned int flags)2882 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2883 struct pipe_inode_info *pipe,
2884 size_t size, unsigned int flags)
2885 {
2886 struct unix_stream_read_state state = {
2887 .recv_actor = unix_stream_splice_actor,
2888 .socket = sock,
2889 .pipe = pipe,
2890 .size = size,
2891 .splice_flags = flags,
2892 };
2893
2894 if (unlikely(*ppos))
2895 return -ESPIPE;
2896
2897 if (sock->file->f_flags & O_NONBLOCK ||
2898 flags & SPLICE_F_NONBLOCK)
2899 state.flags = MSG_DONTWAIT;
2900
2901 return unix_stream_read_generic(&state, false);
2902 }
2903
/* Shut down one or both directions of an AF_UNIX socket.
 *
 * @mode is the userspace SHUT_RD/SHUT_WR/SHUT_RDWR value, remapped to
 * the kernel RCV_SHUTDOWN/SEND_SHUTDOWN bit mask.  For connection
 * oriented types, mirrored shutdown bits are also set on the peer,
 * which is then woken so blocked readers/writers notice.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	/* WRITE_ONCE() pairs with lockless readers of sk_shutdown,
	 * e.g. the poll handlers.
	 */
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);	/* keep peer alive outside our lock */
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		/* NOTE(review): unhash on a BPF-swapped proto presumably
		 * detaches the peer's psock/sockmap entry - confirm.
		 */
		if (prot->unhash)
			prot->unhash(other);
		/* Closing our receive side means the peer can no longer
		 * send, and vice versa.
		 */
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
2952
unix_inq_len(struct sock * sk)2953 long unix_inq_len(struct sock *sk)
2954 {
2955 struct sk_buff *skb;
2956 long amount = 0;
2957
2958 if (sk->sk_state == TCP_LISTEN)
2959 return -EINVAL;
2960
2961 spin_lock(&sk->sk_receive_queue.lock);
2962 if (sk->sk_type == SOCK_STREAM ||
2963 sk->sk_type == SOCK_SEQPACKET) {
2964 skb_queue_walk(&sk->sk_receive_queue, skb)
2965 amount += unix_skb_len(skb);
2966 } else {
2967 skb = skb_peek(&sk->sk_receive_queue);
2968 if (skb)
2969 amount = skb->len;
2970 }
2971 spin_unlock(&sk->sk_receive_queue.lock);
2972
2973 return amount;
2974 }
2975 EXPORT_SYMBOL_GPL(unix_inq_len);
2976
/* SIOCOUTQ: bytes sent but not yet consumed by the receiver, i.e. the
 * socket's current write-memory charge.
 */
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2982
/* SIOCUNIXFILE: install a new O_CLOEXEC, O_PATH file descriptor for
 * the filesystem object a bound AF_UNIX socket is attached to.
 * Requires CAP_NET_ADMIN in the socket's user namespace; -ENOENT for
 * unbound or abstract (pathless) sockets.  Returns the fd or -errno.
 */
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* Acquire-load pairs with the release store that publishes the
	 * bound address; once visible, the path below is stable.
	 */
	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;	/* abstract namespace: no inode */

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	/* dentry_open() took its own reference; drop our temporary one. */
	path_put(&path);

	return fd;
}
3018
/* ioctl handler shared by all AF_UNIX socket types.
 *
 * SIOCOUTQ     - bytes queued but not yet read by the peer
 * SIOCINQ      - bytes available for reading
 * SIOCUNIXFILE - open the bound filesystem node (see unix_open_file())
 * SIOCATMARK   - is the queue head the OOB byte? (CONFIG_AF_UNIX_OOB)
 */
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;	/* -EINVAL on listening sockets */
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			/* Lockless peek: at the mark iff the queue head is
			 * the currently published oob_skb.
			 */
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
3059
3060 #ifdef CONFIG_COMPAT
/* 32-bit compat ioctl: every AF_UNIX ioctl takes a pointer argument,
 * so translating the compat user pointer is all that is needed.
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
3065 #endif
3066
/* poll() for SOCK_STREAM / SOCK_SEQPACKET sockets.  Runs without the
 * state lock, using READ_ONCE()/lockless queue checks on fields that
 * writers update under the lock with WRITE_ONCE().
 */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* Pending out-of-band byte. */
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
3109
/* poll() for SOCK_DGRAM / SOCK_SEQPACKET sockets.
 *
 * Like unix_poll(), but writability additionally accounts for the
 * connected peer's receive queue: a full peer that has not linked back
 * to us suppresses EPOLLOUT, and we register on the peer's wake list
 * (unix_dgram_peer_wake_me()) to be notified when space frees up.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		/* Peer full and we successfully armed the peer-wake
		 * callback: report not writable for now.
		 */
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3172
3173 #ifdef CONFIG_PROC_FS
3174
/* /proc/net/unix iterator position encoding: the upper bits of *pos
 * select a hash bucket, the low BUCKET_SPACE bits hold a 1-based
 * offset within that bucket.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3180
/* Return the offset-th socket (1-based, per the *pos encoding) of the
 * bucket *pos selects, or NULL if the bucket has fewer entries.
 * Caller must hold that bucket's lock.
 */
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}
3196
/* Find the first socket at or after *pos, scanning forward through the
 * hash buckets.  On success the matching bucket's lock is left HELD
 * (released later by unix_get_next()/unix_seq_stop()).
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;	/* bucket lock intentionally held */

		spin_unlock(&net->unx.table.locks[bucket]);

		/* Empty/exhausted: restart at entry 1 of the next bucket. */
		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
3217
/* Advance to the next socket, crossing into the next bucket (and
 * switching bucket locks) when the current chain is exhausted.
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
3234
/* seq_file start: position 0 yields the header token, anything else
 * resolves to the socket the position encodes.
 */
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? unix_get_first(seq, pos) : SEQ_START_TOKEN;
}
3242
/* seq_file next: after the header token start from the first socket,
 * otherwise step past the current one.
 */
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	return v == SEQ_START_TOKEN ? unix_get_first(seq, pos)
				    : unix_get_next(seq, v, pos);
}
3252
unix_seq_stop(struct seq_file * seq,void * v)3253 static void unix_seq_stop(struct seq_file *seq, void *v)
3254 {
3255 struct sock *sk = v;
3256
3257 if (sk)
3258 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3259 }
3260
/* Emit one /proc/net/unix line (or the column header for the start
 * token): hashed pointer, refcount, protocol (always 0 here), flags
 * (__SO_ACCEPTCON for listeners), type, pseudo-state, inode, and the
 * bound path.  Abstract addresses print '@' for the leading NUL and
 * any embedded NULs.
 */
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	// under a hash table lock here
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			/* Address length minus the sun_family header. */
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;	/* pathname: drop trailing NUL */
			} else {
				seq_putc(seq, '@');	/* abstract marker */
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
3306
/* seq_file operations backing /proc/net/unix. */
static const struct seq_operations unix_seq_ops = {
	.start = unix_seq_start,
	.next  = unix_seq_next,
	.stop  = unix_seq_stop,
	.show  = unix_seq_show,
};
3313
3314 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Per-reader state for the BPF unix-socket iterator: sockets are
 * batched one hash bucket at a time with a reference held on each.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;	/* next batch index to hand to ->show() */
	unsigned int end_sk;	/* number of sockets currently batched */
	unsigned int max_sk;	/* capacity of @batch */
	struct sock **batch;	/* sockets pinned via sock_hold() */
	bool st_bucket_done;	/* the whole bucket fit into @batch */
};

/* Context structure exposed to the attached BPF program. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
3329
/* Build the iterator context and invoke the attached BPF program for
 * one socket.  Returns the program's verdict from bpf_iter_run_prog().
 */
static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx = {
		.unix_sk = unix_sk,
		.uid = uid,
	};

	/* The SEQ_START_TOKEN header consumed one sequence number;
	 * hide it from the BPF program's view.
	 */
	meta->seq_num--;
	ctx.meta = meta;

	return bpf_iter_run_prog(prog, &ctx);
}
3341
/* Pin every socket in @start_sk's bucket (up to the batch capacity)
 * and then drop the bucket lock.  Returns the total number of sockets
 * in the bucket so the caller can detect that the batch was too small
 * and needs to be regrown.
 */
static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		/* Keep counting even the sockets that did not fit. */
		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}
3365
bpf_iter_unix_put_batch(struct bpf_unix_iter_state * iter)3366 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3367 {
3368 while (iter->cur_sk < iter->end_sk)
3369 sock_put(iter->batch[iter->cur_sk++]);
3370 }
3371
bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state * iter,unsigned int new_batch_sz)3372 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3373 unsigned int new_batch_sz)
3374 {
3375 struct sock **new_batch;
3376
3377 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3378 GFP_USER | __GFP_NOWARN);
3379 if (!new_batch)
3380 return -ENOMEM;
3381
3382 bpf_iter_unix_put_batch(iter);
3383 kvfree(iter->batch);
3384 iter->batch = new_batch;
3385 iter->max_sk = new_batch_sz;
3386
3387 return 0;
3388 }
3389
/* Fill the batch with the sockets of the next bucket to visit.
 *
 * If the previous bucket was fully batched (st_bucket_done), move on to
 * the next bucket; otherwise re-batch the same bucket so no socket is
 * skipped.  When the batch turns out smaller than the bucket, grow it
 * once (by 3/2 of the observed size) and retry.  Returns the first
 * socket of the new batch, or NULL when iteration is complete.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	/* Drops the bucket lock taken by unix_get_first(). */
	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch was too small; resize once, then live with a partial
	 * bucket if allocation fails or the bucket keeps growing.
	 */
	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
3424
/* seq_file ->start() for the BPF iterator.  bpf iter does not support
 * lseek, so iteration always continues from where it was stop()-ped.
 */
static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? bpf_iter_unix_batch(seq, pos) : SEQ_START_TOKEN;
}
3435
/* seq_file ->next() for the BPF iterator: release the socket that was
 * just shown, then serve the next one from the batch, refilling the
 * batch from the next bucket when it runs dry.
 */
static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}
3457
/* seq_file ->show() for the BPF iterator: run the attached program on
 * one socket with the socket lock held (fast variant).  Sockets that
 * were unhashed after batching are silently skipped.
 */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	/* The batch only holds a reference; the socket may have been
	 * closed/unhashed since it was collected.
	 */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
3485
/* seq_file ->stop() for the BPF iterator.  A NULL @v means the whole
 * table was walked, so give the program one last (end-of-iteration)
 * invocation; then drop any references left in the batch.
 */
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}
3502
/* seq_file operations backing the BPF unix-socket iterator. */
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start = bpf_iter_unix_seq_start,
	.next  = bpf_iter_unix_seq_next,
	.stop  = bpf_iter_unix_seq_stop,
	.show  = bpf_iter_unix_seq_show,
};
3509 #endif
3510 #endif
3511
/* PF_UNIX entry in the socket() family table. */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
3517
3518
/* Per-network-namespace setup: sysctls, /proc/net/unix, and the
 * per-netns socket hash table (locks + buckets).  Unwinds in reverse
 * order on failure; always returns -ENOMEM on any error.
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
	/* Without CONFIG_PROC_FS, err_proc falls straight through to the
	 * sysctl unwind; the label inside the #ifdef keeps both
	 * configurations building.
	 */
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
3562
/* Per-network-namespace teardown: mirror image of unix_net_init().
 * By the time this runs the namespace has no remaining users, so the
 * table can be freed before the proc entry is removed.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3570
/* Hooks run on every network-namespace creation/destruction. */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3575
3576 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Generates the bpf_iter target plumbing for "unix" with the given
 * BPF-visible context arguments (see struct bpf_iter__unix).
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Initial batch capacity; grown on demand by bpf_iter_unix_batch(). */
#define INIT_BATCH_SZ 16
3581
3582 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3583 {
3584 struct bpf_unix_iter_state *iter = priv_data;
3585 int err;
3586
3587 err = bpf_iter_init_seq_net(priv_data, aux);
3588 if (err)
3589 return err;
3590
3591 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3592 if (err) {
3593 bpf_iter_fini_seq_net(priv_data);
3594 return err;
3595 }
3596
3597 return 0;
3598 }
3599
bpf_iter_fini_unix(void * priv_data)3600 static void bpf_iter_fini_unix(void *priv_data)
3601 {
3602 struct bpf_unix_iter_state *iter = priv_data;
3603
3604 bpf_iter_fini_seq_net(priv_data);
3605 kvfree(iter->batch);
3606 }
3607
/* Glue binding the seq_ops and private-state lifecycle for the
 * "unix" bpf_iter target.
 */
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
3614
3615 static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,const struct bpf_prog * prog)3616 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3617 const struct bpf_prog *prog)
3618 {
3619 switch (func_id) {
3620 case BPF_FUNC_setsockopt:
3621 return &bpf_sk_setsockopt_proto;
3622 case BPF_FUNC_getsockopt:
3623 return &bpf_sk_getsockopt_proto;
3624 default:
3625 return NULL;
3626 }
3627 }
3628
/* Registration record for the "unix" bpf_iter target; the unix_sk
 * context argument may be NULL (end-of-iteration invocation).
 */
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto         = bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
3639
/* Register the "unix" bpf_iter target at boot; failure is non-fatal
 * (the iterator is simply unavailable).
 */
static void __init bpf_iter_register(void)
{
	/* The BTF id must be resolved at runtime, after BTF is loaded. */
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
3646 #endif
3647
/* Module/boot-time initialization for AF_UNIX: register the dgram and
 * stream protos (rolling back the first if the second fails), then the
 * socket family, pernet hooks, sockmap proto, and the BPF iterator.
 */
static int __init af_unix_init(void)
{
	int i, rc = -1;

	/* unix_skb_parms rides in skb->cb; make sure it still fits. */
	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	/* The global table of bound pathname sockets uses half the
	 * bucket count of the per-netns hash.
	 */
	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}
3683
/* Module unload: unregister everything af_unix_init() set up. */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3691
3692 /* Earlier than device_initcall() so that other drivers invoking
3693 request_module() don't end up in a loop when modprobe tries
3694 to use a UNIX socket. But later than subsys_initcall() because
3695 we depend on stuff initialised there */
3696 fs_initcall(af_unix_init);
3697 module_exit(af_unix_exit);
3698
3699 MODULE_LICENSE("GPL");
3700 MODULE_ALIAS_NETPROTO(PF_UNIX);
3701