// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/time_namespace.h>

#include <asm/futex.h>

#include "locking/rtmutex_common.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wake up scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and the wake call can simply return. In
 * order for this optimization to work, ordering guarantees must exist so
 * that the waiter being added to the list is observed when the list is
 * concurrently checked by the waker, avoiding scenarios like the following:
70  *
71  * CPU 0                               CPU 1
72  * val = *futex;
73  * sys_futex(WAIT, futex, val);
74  *   futex_wait(futex, val);
75  *   uval = *futex;
76  *                                     *futex = newval;
77  *                                     sys_futex(WAKE, futex);
78  *                                       futex_wake(futex);
79  *                                       if (queue_empty())
80  *                                         return;
81  *   if (uval == val)
82  *      lock(hash_bucket(futex));
83  *      queue();
84  *     unlock(hash_bucket(futex));
85  *     schedule();
86  *
87  * This would cause the waiter on CPU 0 to wait forever because it
88  * missed the transition of the user space value from val to newval
89  * and the waker did not find the waiter in the hash bucket queue.
90  *
91  * The correct serialization ensures that a waiter either observes
92  * the changed user space value before blocking or is woken by a
93  * concurrent waker:
94  *
95  * CPU 0                                 CPU 1
96  * val = *futex;
97  * sys_futex(WAIT, futex, val);
98  *   futex_wait(futex, val);
99  *
100  *   waiters++; (a)
101  *   smp_mb(); (A) <-- paired with -.
102  *                                  |
103  *   lock(hash_bucket(futex));      |
104  *                                  |
105  *   uval = *futex;                 |
106  *                                  |        *futex = newval;
107  *                                  |        sys_futex(WAKE, futex);
108  *                                  |          futex_wake(futex);
109  *                                  |
110  *                                  `--------> smp_mb(); (B)
111  *   if (uval == val)
112  *     queue();
113  *     unlock(hash_bucket(futex));
114  *     schedule();                         if (waiters)
115  *                                           lock(hash_bucket(futex));
116  *   else                                    wake_waiters(futex);
117  *     waiters--; (b)                        unlock(hash_bucket(futex));
118  *
119  * Where (A) orders the waiters increment and the futex value read through
120  * atomic operations (see hb_waiters_inc) and where (B) orders the write
121  * to futex and the waiters read (see hb_waiters_pending()).
122  *
123  * This yields the following case (where X:=waiters, Y:=futex):
124  *
125  *	X = Y = 0
126  *
127  *	w[X]=1		w[Y]=1
128  *	MB		MB
129  *	r[Y]=y		r[X]=x
130  *
131  * Which guarantees that x==0 && y==0 is impossible; which translates back into
132  * the guarantee that we cannot both miss the futex variable change and the
133  * enqueue.
134  *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return an error, in which case we backtrack from it in (b).
 * Refer to the comment in queue_lock().
 *
 * Similarly, in order to account for waiters being requeued to another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock. They are then decremented again after releasing it -
 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done for
 * double_lock_hb() and double_unlock_hb(), respectively.
 */
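
/*
 * To make the pairing above concrete, here is a minimal user space
 * sketch of the protocol the kernel side serializes against. This is
 * illustrative only (no error handling, no PI, helper names invented);
 * glibc provides no futex() wrapper, so a raw syscall is used:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long futex(unsigned int *uaddr, int op, unsigned int val)
 *	{
 *		return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
 *	}
 *
 *	static void wait_for_flag(unsigned int *flag)
 *	{
 *		unsigned int val;
 *
 *		while ((val = __atomic_load_n(flag, __ATOMIC_ACQUIRE)) == 0) {
 *			// The kernel re-reads *flag under the hb lock; if it
 *			// no longer equals 'val', FUTEX_WAIT returns EAGAIN
 *			// instead of sleeping, closing the lost-wakeup
 *			// window shown in the first diagram above.
 *			futex(flag, FUTEX_WAIT, val);
 *		}
 *	}
 *
 *	static void set_flag(unsigned int *flag)
 *	{
 *		// This store is what barrier (B) in futex_wake() orders
 *		// against the hb->waiters check.
 *		__atomic_store_n(flag, 1, __ATOMIC_SEQ_CST);
 *		futex(flag, FUTEX_WAKE, 1);
 *	}
 */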

#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
#define futex_cmpxchg_enabled 1
#else
static int __read_mostly futex_cmpxchg_enabled;
#endif

/*
 * Futex flags used to encode options to functions and preserve them across
 * restarts.
 */
#ifdef CONFIG_MMU
# define FLAGS_SHARED		0x01
#else
/*
 * NOMMU does not have per process address space. Let the compiler optimize
 * code away.
 */
# define FLAGS_SHARED		0x00
#endif
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex_base pi_mutex;

	struct task_struct *owner;
	refcount_t refcount;

	union futex_key key;
} __randomize_layout;

/**
 * struct futex_q - The hashed futex queue entry, one per waiting task
 * @list:		priority-sorted list of tasks waiting on this futex
 * @task:		the task waiting on the futex
 * @lock_ptr:		the hash bucket lock
 * @key:		the key the futex is hashed on
 * @pi_state:		optional priority inheritance state
 * @rt_waiter:		rt_waiter storage for use with requeue_pi
 * @requeue_pi_key:	the requeue_pi target futex key
 * @bitset:		bitset for the optional bitmasked wakeup
 * @requeue_state:	State field for futex_requeue_pi()
 * @requeue_wait:	RCU wait for futex_requeue_pi() (RT only)
 *
 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
 * the rt_mutex code. See unqueue_me_pi().
 */
struct futex_q {
	struct plist_node list;

	struct task_struct *task;
	spinlock_t *lock_ptr;
	union futex_key key;
	struct futex_pi_state *pi_state;
	struct rt_mutex_waiter *rt_waiter;
	union futex_key *requeue_pi_key;
	u32 bitset;
	atomic_t requeue_state;
#ifdef CONFIG_PREEMPT_RT
	struct rcuwait requeue_wait;
#endif
} __randomize_layout;

/*
 * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
 * underlying rtmutex. The task which is about to be requeued could have
 * just woken up (timeout, signal). After the wake up the task has to
 * acquire the hash bucket lock, which is held by the requeue code.  As a
 * task can only be blocked on _ONE_ rtmutex at a time, the proxy lock
 * blocking and the hash bucket lock blocking would collide and corrupt
 * state.
 *
 * On !PREEMPT_RT this is not a problem and everything could be serialized
 * on the hash bucket lock, but aside from having the benefit of common
 * code, this also allows us to avoid doing the requeue when the task is
 * already on the way out, and to avoid taking the hash bucket lock of the
 * original uaddr1 when the requeue has been completed.
 *
 * The following state transitions are valid:
 *
 * On the waiter side:
 *   Q_REQUEUE_PI_NONE		-> Q_REQUEUE_PI_IGNORE
 *   Q_REQUEUE_PI_IN_PROGRESS	-> Q_REQUEUE_PI_WAIT
 *
 * On the requeue side:
 *   Q_REQUEUE_PI_NONE		-> Q_REQUEUE_PI_IN_PROGRESS
 *   Q_REQUEUE_PI_IN_PROGRESS	-> Q_REQUEUE_PI_DONE/LOCKED
 *   Q_REQUEUE_PI_IN_PROGRESS	-> Q_REQUEUE_PI_NONE (requeue failed)
 *   Q_REQUEUE_PI_WAIT		-> Q_REQUEUE_PI_DONE/LOCKED
 *   Q_REQUEUE_PI_WAIT		-> Q_REQUEUE_PI_IGNORE (requeue failed)
 *
 * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
 * signals that the waiter is already on the way out. It also means that
 * the waiter is still on the 'wait' futex, i.e. uaddr1.
 *
 * The waiter side signals early wakeup to the requeue side either by
 * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT, depending
 * on the current state. In the case of Q_REQUEUE_PI_IGNORE it can
 * immediately proceed to take the hash bucket lock of uaddr1. If it set
 * state to WAIT, which means the wakeup is interleaving with a requeue in
 * progress, it has to wait for the requeue side to change the state:
 * either to DONE/LOCKED or to IGNORE. DONE/LOCKED means the waiter q is
 * now on the uaddr2 futex and either blocked (DONE) or has acquired it
 * (LOCKED). IGNORE is set by the requeue side when the requeue attempt
 * failed via deadlock detection, and therefore the waiter q is still on
 * the uaddr1 futex.
 */
enum {
	Q_REQUEUE_PI_NONE		=  0,
	Q_REQUEUE_PI_IGNORE,
	Q_REQUEUE_PI_IN_PROGRESS,
	Q_REQUEUE_PI_WAIT,
	Q_REQUEUE_PI_DONE,
	Q_REQUEUE_PI_LOCKED,
};
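
/*
 * A condensed sketch of how the waiter side consumes the transitions
 * above on early wakeup (illustrative only; the real logic lives in
 * futex_requeue_pi_wakeup_sync() and the requeue code):
 *
 *	int old = atomic_read(&q->requeue_state);
 *
 *	for (;;) {
 *		int new;
 *
 *		if (old == Q_REQUEUE_PI_NONE)
 *			new = Q_REQUEUE_PI_IGNORE;	// not requeued yet
 *		else if (old == Q_REQUEUE_PI_IN_PROGRESS)
 *			new = Q_REQUEUE_PI_WAIT;	// requeue in flight
 *		else
 *			break;	// DONE/LOCKED: the requeue side finished
 *
 *		if (atomic_try_cmpxchg(&q->requeue_state, &old, new))
 *			break;	// on failure 'old' was reloaded; retry
 *	}
 */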

static const struct futex_q futex_q_init = {
	/* list gets initialized in queue_me() */
	.key		= FUTEX_KEY_INIT,
	.bitset		= FUTEX_BITSET_MATCH_ANY,
	.requeue_state	= ATOMIC_INIT(Q_REQUEUE_PI_NONE),
};

/*
 * Hash buckets are shared by all the futex_keys that hash to the same
 * location.  Each key may have multiple futex_q structures, one for each task
 * waiting on a futex.
 */
struct futex_hash_bucket {
	atomic_t waiters;
	spinlock_t lock;
	struct plist_head chain;
} ____cacheline_aligned_in_smp;

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in hash_futex()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long            hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues   (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

static bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else
static inline bool should_fail_futex(bool fshared)
{
	return false;
}
#endif /* CONFIG_FAIL_FUTEX */

#ifdef CONFIG_COMPAT
static void compat_exit_robust_list(struct task_struct *curr);
#endif

/*
 * Reflects a new waiter being added to the waitqueue.
 */
static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_inc(&hb->waiters);
	/*
	 * Full barrier (A), see the ordering comment above.
	 */
	smp_mb__after_atomic();
#endif
}

/*
 * Reflects a waiter being removed from the waitqueue by wakeup
 * paths.
 */
static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_dec(&hb->waiters);
#endif
}

static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	/*
	 * Full barrier (B), see the ordering comment above.
	 */
	smp_mb();
	return atomic_read(&hb->waiters);
#else
	return 1;
#endif
}

/**
 * hash_futex - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}


/**
 * match_futex - Check whether two futex keys are equal
 * @key1:	Pointer to key1
 * @key2:	Pointer to key2
 *
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1 && key2
		&& key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

enum futex_access {
	FUTEX_READ,
	FUTEX_WRITE
};

/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
static inline struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}
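
/*
 * A typical caller pattern for the helper above (a condensed sketch;
 * futex_wait() uses it roughly like this, with 'abs_time' and 'flags'
 * coming in from the syscall):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	...
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */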

/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
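 * As a quick check: 2^64 ns is about 1.8 * 10^19 ns, i.e. roughly
 * 1.8 * 10^10 seconds, which at ~3.16 * 10^7 seconds per year works
 * out to about 584.5 years.
 *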
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that match_futex() will never have a false-positive,
 * especially for PI futexes where a false match can corrupt the state. The
 * above argues that false-negatives are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_add_return(1, &i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
		if (old)
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *              FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
			 enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page, *tail;
	struct address_space *mapping;
	int err, ro = 0;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * the virtual address, we don't even have to find the underlying vma.
	 * Note: we do have to check that 'uaddr' is a valid user address,
	 *       but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		key->private.mm = mm;
		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The page
	 * lock protects many things but in this context the page lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the page lock is not needed in all cases being
	 * considered here and the page lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the page lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the head page for any compound page so the
	 * head page and mapping is looked up now. For anonymous pages, it
	 * does not matter if the page splits in the future as the key is
	 * based on the address. For filesystem-backed pages, the tail is
	 * required as the index of the page determines the key. For
	 * base pages, there is no tail page and tail == page.
	 */
	tail = page;
	page = compound_head(page);
	mapping = READ_ONCE(page->mapping);

	/*
	 * If page->mapping is NULL, then it cannot be a PageAnon
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the page lock (also
	 * cases which we are happy to fail).  And we hold a reference,
	 * so refcount care in invalidate_complete_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for page->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Page lock is required to identify which special case above
		 * applies. If this is really a shmem page then the page lock
		 * will prevent unexpected transitions.
		 */
		lock_page(page);
		shmem_swizzled = PageSwapCache(page) || page->mapping;
		unlock_page(page);
		put_page(page);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored on an anonymous page, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (PageAnon(page)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the page->mapping must be traversed. Ordinarily this should
		 * be stabilised under page lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update the radix tree or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(page->mapping) != mapping) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = page_to_pgoff(tail);
		rcu_read_unlock();
	}

out:
	put_page(page);
	return err;
}
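
/*
 * Example of the two key flavours get_futex_key() produces. All the
 * concrete values below are illustrative:
 *
 *	private futex at user address 0x7f1234567008:
 *		key->private.mm      = current->mm;
 *		key->private.address = 0x7f1234567000;	// page-aligned
 *		key->both.offset     = 0x8;
 *
 *	shared file-backed futex at the same page offset, on page 3 of
 *	the mapped file:
 *		key->shared.i_seq    = get_inode_sequence_number(inode);
 *		key->shared.pgoff    = 3;
 *		key->both.offset     = 0x8 | FUT_OFF_INODE;
 */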

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fix up the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we might as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
static int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex_q's)
 *
 * Must be called with the hb lock held.
 */
static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
					union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (match_futex(&this->key, key))
			return this;
	}
	return NULL;
}

static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
				      u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}


/*
 * PI code:
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

static void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
static void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case, drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non-PI futex.
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain an arbitrary number
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
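
/*
 * exit_pi_state_list() above is the canonical instance of this lock
 * order; condensed, its nesting looks like:
 *
 *	spin_lock(&hb->lock);
 *	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 *	raw_spin_lock(&curr->pi_lock);
 *	...
 *	raw_spin_unlock(&curr->pi_lock);
 *	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 *	spin_unlock(&hb->lock);
 */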

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (get_futex_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret: owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}

static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *  ...				       attach();
	 *  tsk->futex_state =               } else {
	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (get_futex_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0. [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
				union futex_key *key,
				struct futex_pi_state **ps,
				struct task_struct *task,
				struct task_struct **exiting,
				int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (get_futex_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and the user space TID is 0. We are here because
	 * the waiters or the owner died bit is set, because this was
	 * called from requeue_cmp_pi, or because something else took
	 * us into the syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If the owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/**
 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
static void __unqueue_futex(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	hb_waiters_dec(hb);
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed. Callers
 * must ensure to later call wake_up_q() for the actual
 * wakeups to occur.
 */
static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
	struct task_struct *p = q->task;

	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
		return;

	get_task_struct(p);
	__unqueue_futex(q);
	/*
	 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
	 * is written, without taking any locks. This is possible in the event
	 * of a spurious wakeup, for example. A memory barrier is required here
	 * to prevent the following store to lock_ptr from getting ahead of the
	 * plist_del in __unqueue_futex().
	 */
	smp_store_release(&q->lock_ptr, NULL);

	/*
	 * Queue the task for later wakeup for after we've released
	 * the hb->lock.
	 */
	wake_q_add_safe(wake_q, p);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We clean up the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

/*
 * Express the locking dependencies for lockdep:
 */
static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	if (hb1 <= hb2) {
		spin_lock(&hb1->lock);
		if (hb1 < hb2)
			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
	} else { /* hb1 > hb2 */
		spin_lock(&hb2->lock);
		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
	}
}
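
/*
 * Taking the two bucket locks in ascending address order, as
 * double_lock_hb() above does, makes the double-lock paths
 * deadlock-free against each other: two tasks operating on the same
 * pair of buckets in opposite order both take the lower-addressed
 * lock first, so an ABBA deadlock such as
 *
 *	CPU 0			CPU 1
 *	lock(hb1->lock);	lock(hb2->lock);
 *	lock(hb2->lock);	lock(hb1->lock);
 *
 * cannot happen. When hb1 == hb2 the lock is taken only once.
 */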
1638 
1639 static inline void
double_unlock_hb(struct futex_hash_bucket * hb1,struct futex_hash_bucket * hb2)1640 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1641 {
1642 	spin_unlock(&hb1->lock);
1643 	if (hb1 != hb2)
1644 		spin_unlock(&hb2->lock);
1645 }
1646 
1647 /*
1648  * Wake up waiters matching bitset queued on this futex (uaddr).
1649  */
1650 static int
futex_wake(u32 __user * uaddr,unsigned int flags,int nr_wake,u32 bitset)1651 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1652 {
1653 	struct futex_hash_bucket *hb;
1654 	struct futex_q *this, *next;
1655 	union futex_key key = FUTEX_KEY_INIT;
1656 	int ret;
1657 	DEFINE_WAKE_Q(wake_q);
1658 
1659 	if (!bitset)
1660 		return -EINVAL;
1661 
1662 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
1663 	if (unlikely(ret != 0))
1664 		return ret;
1665 
1666 	hb = hash_futex(&key);
1667 
1668 	/* Make sure we really have tasks to wake up */
1669 	if (!hb_waiters_pending(hb))
1670 		return ret;
1671 
1672 	spin_lock(&hb->lock);
1673 
1674 	plist_for_each_entry_safe(this, next, &hb->chain, list) {
1675 		if (match_futex(&this->key, &key)) {
1676 			if (this->pi_state || this->rt_waiter) {
1677 				ret = -EINVAL;
1678 				break;
1679 			}
1680 
1681 			/* Check if one of the bits is set in both bitsets */
1682 			if (!(this->bitset & bitset))
1683 				continue;
1684 
1685 			mark_wake_futex(&wake_q, this);
1686 			if (++ret >= nr_wake)
1687 				break;
1688 		}
1689 	}
1690 
1691 	spin_unlock(&hb->lock);
1692 	wake_up_q(&wake_q);
1693 	return ret;
1694 }
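
/*
 * Illustrative sketch, not kernel code: how user space typically drives
 * the wake side above. Only the syscall convention and the FUTEX_*
 * constants are real uapi; the helper name is hypothetical:
 *
 *	#include <stdint.h>
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long wake_waiters(uint32_t *uaddr, int nr, uint32_t bits)
 *	{
 *		// val3 (the last argument) is the bitset; a waiter is only
 *		// eligible if its FUTEX_WAIT_BITSET mask intersects it.
 *		return syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET, nr,
 *			       NULL, NULL, bits);
 *	}
 *
 * Plain FUTEX_WAKE is the special case bits == FUTEX_BITSET_MATCH_ANY.
 */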
1695 
1696 static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1697 {
1698 	unsigned int op =	  (encoded_op & 0x70000000) >> 28;
1699 	unsigned int cmp =	  (encoded_op & 0x0f000000) >> 24;
1700 	int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
1701 	int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
1702 	int oldval, ret;
1703 
1704 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
1705 		if (oparg < 0 || oparg > 31) {
1706 			char comm[sizeof(current->comm)];
1707 			/*
1708 			 * kill this print and return -EINVAL when userspace
1709 			 * is sane again
1710 			 */
1711 			pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
1712 					get_task_comm(comm, current), oparg);
1713 			oparg &= 31;
1714 		}
1715 		oparg = 1 << oparg;
1716 	}
1717 
1718 	pagefault_disable();
1719 	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
1720 	pagefault_enable();
1721 	if (ret)
1722 		return ret;
1723 
1724 	switch (cmp) {
1725 	case FUTEX_OP_CMP_EQ:
1726 		return oldval == cmparg;
1727 	case FUTEX_OP_CMP_NE:
1728 		return oldval != cmparg;
1729 	case FUTEX_OP_CMP_LT:
1730 		return oldval < cmparg;
1731 	case FUTEX_OP_CMP_GE:
1732 		return oldval >= cmparg;
1733 	case FUTEX_OP_CMP_LE:
1734 		return oldval <= cmparg;
1735 	case FUTEX_OP_CMP_GT:
1736 		return oldval > cmparg;
1737 	default:
1738 		return -ENOSYS;
1739 	}
1740 }
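
/*
 * Illustrative note: user space packs @encoded_op with the FUTEX_OP()
 * macro from the uapi header <linux/futex.h>, which mirrors the decoding
 * above:
 *
 *	FUTEX_OP(op, oparg, cmp, cmparg) =
 *		((op & 0xf) << 28) | ((cmp & 0xf) << 24) |
 *		((oparg & 0xfff) << 12) | (cmparg & 0xfff)
 *
 * For example, "add 1 to *uaddr and report whether the old value was > 0"
 * is FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0). With
 * FUTEX_OP_OPARG_SHIFT or'ed into @op, oparg is a shift count (1 << oparg)
 * instead, as handled at the top of this function.
 */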
1741 
1742 /*
1743  * Wake up all waiters hashed on the physical page that is mapped
1744  * to this virtual address:
1745  */
1746 static int
1747 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1748 	      int nr_wake, int nr_wake2, int op)
1749 {
1750 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1751 	struct futex_hash_bucket *hb1, *hb2;
1752 	struct futex_q *this, *next;
1753 	int ret, op_ret;
1754 	DEFINE_WAKE_Q(wake_q);
1755 
1756 retry:
1757 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
1758 	if (unlikely(ret != 0))
1759 		return ret;
1760 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
1761 	if (unlikely(ret != 0))
1762 		return ret;
1763 
1764 	hb1 = hash_futex(&key1);
1765 	hb2 = hash_futex(&key2);
1766 
1767 retry_private:
1768 	double_lock_hb(hb1, hb2);
1769 	op_ret = futex_atomic_op_inuser(op, uaddr2);
1770 	if (unlikely(op_ret < 0)) {
1771 		double_unlock_hb(hb1, hb2);
1772 
1773 		if (!IS_ENABLED(CONFIG_MMU) ||
1774 		    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
1775 			/*
1776 			 * we don't get EFAULT from MMU faults if we don't have
1777 			 * an MMU, but we might get them from range checking
1778 			 */
1779 			ret = op_ret;
1780 			return ret;
1781 		}
1782 
1783 		if (op_ret == -EFAULT) {
1784 			ret = fault_in_user_writeable(uaddr2);
1785 			if (ret)
1786 				return ret;
1787 		}
1788 
1789 		cond_resched();
1790 		if (!(flags & FLAGS_SHARED))
1791 			goto retry_private;
1792 		goto retry;
1793 	}
1794 
1795 	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1796 		if (match_futex(&this->key, &key1)) {
1797 			if (this->pi_state || this->rt_waiter) {
1798 				ret = -EINVAL;
1799 				goto out_unlock;
1800 			}
1801 			mark_wake_futex(&wake_q, this);
1802 			if (++ret >= nr_wake)
1803 				break;
1804 		}
1805 	}
1806 
1807 	if (op_ret > 0) {
1808 		op_ret = 0;
1809 		plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1810 			if (match_futex(&this->key, &key2)) {
1811 				if (this->pi_state || this->rt_waiter) {
1812 					ret = -EINVAL;
1813 					goto out_unlock;
1814 				}
1815 				mark_wake_futex(&wake_q, this);
1816 				if (++op_ret >= nr_wake2)
1817 					break;
1818 			}
1819 		}
1820 		ret += op_ret;
1821 	}
1822 
1823 out_unlock:
1824 	double_unlock_hb(hb1, hb2);
1825 	wake_up_q(&wake_q);
1826 	return ret;
1827 }
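
/*
 * Illustrative sketch, not kernel code: the canonical FUTEX_WAKE_OP
 * consumer is a condvar-style signal that wakes one waiter on the
 * condition word, atomically updates the lock word and, if the encoded
 * comparison on its old value succeeds, wakes one waiter there too, all
 * in a single syscall. Variable names are hypothetical; note that for
 * FUTEX_WAKE_OP the timeout slot is reused to carry nr_wake2:
 *
 *	syscall(SYS_futex, &cond_word, FUTEX_WAKE_OP, 1,
 *		(void *)(unsigned long)1,
 *		&lock_word,
 *		FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1));
 */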
1828 
1829 /**
1830  * requeue_futex() - Requeue a futex_q from one hb to another
1831  * @q:		the futex_q to requeue
1832  * @hb1:	the source hash_bucket
1833  * @hb2:	the target hash_bucket
1834  * @key2:	the new key for the requeued futex_q
1835  */
1836 static inline
1837 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1838 		   struct futex_hash_bucket *hb2, union futex_key *key2)
1839 {
1840 
1841 	/*
1842 	 * If key1 and key2 hash to the same bucket, no need to
1843 	 * requeue.
1844 	 */
1845 	if (likely(&hb1->chain != &hb2->chain)) {
1846 		plist_del(&q->list, &hb1->chain);
1847 		hb_waiters_dec(hb1);
1848 		hb_waiters_inc(hb2);
1849 		plist_add(&q->list, &hb2->chain);
1850 		q->lock_ptr = &hb2->lock;
1851 	}
1852 	q->key = *key2;
1853 }
1854 
1855 static inline bool futex_requeue_pi_prepare(struct futex_q *q,
1856 					    struct futex_pi_state *pi_state)
1857 {
1858 	int old, new;
1859 
1860 	/*
1861 	 * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
1862 	 * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
1863 	 * ignore the waiter.
1864 	 */
1865 	old = atomic_read_acquire(&q->requeue_state);
1866 	do {
1867 		if (old == Q_REQUEUE_PI_IGNORE)
1868 			return false;
1869 
1870 		/*
1871 		 * futex_proxy_trylock_atomic() might have set it to
1872 		 * IN_PROGRESS and an interleaved early wake to WAIT.
1873 		 *
1874 		 * An extra state for that trylock was considered, but
1875 		 * that would just add more conditionals all over the
1876 		 * place for dubious value.
1877 		 */
1878 		if (old != Q_REQUEUE_PI_NONE)
1879 			break;
1880 
1881 		new = Q_REQUEUE_PI_IN_PROGRESS;
1882 	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
1883 
1884 	q->pi_state = pi_state;
1885 	return true;
1886 }
1887 
1888 static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
1889 {
1890 	int old, new;
1891 
1892 	old = atomic_read_acquire(&q->requeue_state);
1893 	do {
1894 		if (old == Q_REQUEUE_PI_IGNORE)
1895 			return;
1896 
1897 		if (locked >= 0) {
1898 			/* Requeue succeeded. Set DONE or LOCKED */
1899 			WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
1900 				     old != Q_REQUEUE_PI_WAIT);
1901 			new = Q_REQUEUE_PI_DONE + locked;
1902 		} else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
1903 			/* Deadlock, no early wakeup interleave */
1904 			new = Q_REQUEUE_PI_NONE;
1905 		} else {
1906 			/* Deadlock, early wakeup interleave. */
1907 			WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
1908 			new = Q_REQUEUE_PI_IGNORE;
1909 		}
1910 	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
1911 
1912 #ifdef CONFIG_PREEMPT_RT
1913 	/* If the waiter interleaved with the requeue let it know */
1914 	if (unlikely(old == Q_REQUEUE_PI_WAIT))
1915 		rcuwait_wake_up(&q->requeue_wait);
1916 #endif
1917 }
1918 
1919 static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
1920 {
1921 	int old, new;
1922 
1923 	old = atomic_read_acquire(&q->requeue_state);
1924 	do {
1925 		/* Is requeue done already? */
1926 		if (old >= Q_REQUEUE_PI_DONE)
1927 			return old;
1928 
1929 		/*
1930 		 * If not done, then tell the requeue code to either ignore
1931 		 * the waiter or to wake it up once the requeue is done.
1932 		 */
1933 		new = Q_REQUEUE_PI_WAIT;
1934 		if (old == Q_REQUEUE_PI_NONE)
1935 			new = Q_REQUEUE_PI_IGNORE;
1936 	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
1937 
1938 	/* If the requeue was in progress, wait for it to complete */
1939 	if (old == Q_REQUEUE_PI_IN_PROGRESS) {
1940 #ifdef CONFIG_PREEMPT_RT
1941 		rcuwait_wait_event(&q->requeue_wait,
1942 				   atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
1943 				   TASK_UNINTERRUPTIBLE);
1944 #else
1945 		(void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
1946 #endif
1947 	}
1948 
1949 	/*
1950 	 * Requeue is now either prohibited or complete. Reread state
1951 	 * because during the wait above it might have changed. Nothing
1952 	 * will modify q->requeue_state after this point.
1953 	 */
1954 	return atomic_read(&q->requeue_state);
1955 }
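
/*
 * Informal summary of the requeue_state handshake implemented by the
 * three helpers above (transitions derived from their code):
 *
 *	NONE        -- prepare()             --> IN_PROGRESS
 *	NONE        -- wakeup_sync()         --> IGNORE (requeue skips waiter)
 *	IN_PROGRESS -- wakeup_sync()         --> WAIT   (waiter waits below)
 *	IN_PROGRESS -- complete(locked >= 0) --> DONE or LOCKED
 *	IN_PROGRESS -- complete(locked < 0)  --> NONE   (deadlock backout)
 *	WAIT        -- complete(locked >= 0) --> DONE or LOCKED
 *	WAIT        -- complete(locked < 0)  --> IGNORE
 *
 * On PREEMPT_RT a complete() that observes WAIT also issues an rcuwait
 * wakeup so the side sleeping in futex_requeue_pi_wakeup_sync() makes
 * progress.
 */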
1956 
1957 /**
1958  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1959  * @q:		the futex_q
1960  * @key:	the key of the requeue target futex
1961  * @hb:		the hash_bucket of the requeue target futex
1962  *
1963  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1964  * target futex if it is uncontended or via a lock steal.
1965  *
1966  * 1) Set @q::key to the requeue target futex key so the waiter can detect
1967  *    the wakeup on the right futex.
1968  *
1969  * 2) Dequeue @q from the hash bucket.
1970  *
1971  * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
1972  *    acquisition.
1973  *
1974  * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
1975  *    the waiter has to fixup the pi state.
1976  *
1977  * 5) Complete the requeue state so the waiter can make progress. After
1978  *    this point the waiter task can return from the syscall immediately in
1979  *    case that the pi state does not have to be fixed up.
1980  *
1981  * 6) Wake the waiter task.
1982  *
1983  * Must be called with both q->lock_ptr and hb->lock held.
1984  */
1985 static inline
1986 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1987 			   struct futex_hash_bucket *hb)
1988 {
1989 	q->key = *key;
1990 
1991 	__unqueue_futex(q);
1992 
1993 	WARN_ON(!q->rt_waiter);
1994 	q->rt_waiter = NULL;
1995 
1996 	q->lock_ptr = &hb->lock;
1997 
1998 	/* Signal locked state to the waiter */
1999 	futex_requeue_pi_complete(q, 1);
2000 	wake_up_state(q->task, TASK_NORMAL);
2001 }
2002 
2003 /**
2004  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
2005  * @pifutex:		the user address of the to futex
2006  * @hb1:		the from futex hash bucket, must be locked by the caller
2007  * @hb2:		the to futex hash bucket, must be locked by the caller
2008  * @key1:		the from futex key
2009  * @key2:		the to futex key
2010  * @ps:			address to store the pi_state pointer
2011  * @exiting:		Pointer to store the task pointer of the owner task
2012  *			which is in the middle of exiting
2013  * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
2014  *
2015  * Try and get the lock on behalf of the top waiter if we can do it atomically.
2016  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
2017  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
2018  * hb1 and hb2 must be held by the caller.
2019  *
2020  * @exiting is only set when the return value is -EBUSY. If so, this holds
2021  * a refcount on the exiting task on return and the caller needs to drop it
2022  * after waiting for the exit to complete.
2023  *
2024  * Return:
2025  *  -  0 - failed to acquire the lock atomically;
2026  *  - >0 - acquired the lock, return value is vpid of the top_waiter
2027  *  - <0 - error
2028  */
2029 static int
2030 futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
2031 			   struct futex_hash_bucket *hb2, union futex_key *key1,
2032 			   union futex_key *key2, struct futex_pi_state **ps,
2033 			   struct task_struct **exiting, int set_waiters)
2034 {
2035 	struct futex_q *top_waiter = NULL;
2036 	u32 curval;
2037 	int ret;
2038 
2039 	if (get_futex_value_locked(&curval, pifutex))
2040 		return -EFAULT;
2041 
2042 	if (unlikely(should_fail_futex(true)))
2043 		return -EFAULT;
2044 
2045 	/*
2046 	 * Find the top_waiter and determine if there are additional waiters.
2047 	 * If the caller intends to requeue more than 1 waiter to pifutex,
2048 	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
2049 	 * as we have means to handle the possible fault.  If not, don't set
2050 	 * the bit unnecessarily as it will force the subsequent unlock to enter
2051 	 * the kernel.
2052 	 */
2053 	top_waiter = futex_top_waiter(hb1, key1);
2054 
2055 	/* There are no waiters, nothing for us to do. */
2056 	if (!top_waiter)
2057 		return 0;
2058 
2059 	/*
2060 	 * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
2061 	 * and waiting on the 'waitqueue' futex which is always !PI.
2062 	 */
2063 	if (!top_waiter->rt_waiter || top_waiter->pi_state)
2064 		return -EINVAL;
2065 
2066 	/* Ensure we requeue to the expected futex. */
2067 	if (!match_futex(top_waiter->requeue_pi_key, key2))
2068 		return -EINVAL;
2069 
2070 	/* Ensure that this does not race against an early wakeup */
2071 	if (!futex_requeue_pi_prepare(top_waiter, NULL))
2072 		return -EAGAIN;
2073 
2074 	/*
2075 	 * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
2076 	 * in the contended case or if @set_waiters is true.
2077 	 *
2078 	 * In the contended case PI state is attached to the lock owner. If
2079 	 * the user space lock can be acquired then PI state is attached to
2080 	 * the new owner (@top_waiter->task) when @set_waiters is true.
2081 	 */
2082 	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
2083 				   exiting, set_waiters);
2084 	if (ret == 1) {
2085 		/*
2086 		 * Lock was acquired in user space and PI state was
2087 		 * attached to @top_waiter->task. That means state is fully
2088 		 * consistent and the waiter can return to user space
2089 		 * immediately after the wakeup.
2090 		 */
2091 		requeue_pi_wake_futex(top_waiter, key2, hb2);
2092 	} else if (ret < 0) {
2093 		/* Rewind top_waiter::requeue_state */
2094 		futex_requeue_pi_complete(top_waiter, ret);
2095 	} else {
2096 		/*
2097 		 * futex_lock_pi_atomic() did not acquire the user space
2098 		 * futex, but managed to establish the proxy lock and pi
2099 		 * state. top_waiter::requeue_state cannot be fixed up here
2100 		 * because the waiter is not enqueued on the rtmutex
2101 		 * yet. This is handled at the callsite depending on the
2102 		 * result of rt_mutex_start_proxy_lock() which is
2103 		 * guaranteed to be reached with this function returning 0.
2104 		 */
2105 	}
2106 	return ret;
2107 }
2108 
2109 /**
2110  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
2111  * @uaddr1:	source futex user address
2112  * @flags:	futex flags (FLAGS_SHARED, etc.)
2113  * @uaddr2:	target futex user address
2114  * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
2115  * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
2116  * @cmpval:	@uaddr1 expected value (or %NULL)
2117  * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
2118  *		pi futex (pi to pi requeue is not supported)
2119  *
2120  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
2121  * uaddr2 atomically on behalf of the top waiter.
2122  *
2123  * Return:
2124  *  - >=0 - on success, the number of tasks requeued or woken;
2125  *  -  <0 - on error
2126  */
2127 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
2128 			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
2129 			 u32 *cmpval, int requeue_pi)
2130 {
2131 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
2132 	int task_count = 0, ret;
2133 	struct futex_pi_state *pi_state = NULL;
2134 	struct futex_hash_bucket *hb1, *hb2;
2135 	struct futex_q *this, *next;
2136 	DEFINE_WAKE_Q(wake_q);
2137 
2138 	if (nr_wake < 0 || nr_requeue < 0)
2139 		return -EINVAL;
2140 
2141 	/*
2142 	 * When PI not supported: return -ENOSYS if requeue_pi is true,
2143 	 * consequently the compiler knows requeue_pi is always false past
2144 	 * this point which will optimize away all the conditional code
2145 	 * further down.
2146 	 */
2147 	if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
2148 		return -ENOSYS;
2149 
2150 	if (requeue_pi) {
2151 		/*
2152 		 * Requeue PI only works on two distinct uaddrs. This
2153 		 * check is only valid for private futexes. See below.
2154 		 */
2155 		if (uaddr1 == uaddr2)
2156 			return -EINVAL;
2157 
2158 		/*
2159 		 * futex_requeue() allows the caller to define the number
2160 		 * of waiters to wake up via the @nr_wake argument. With
2161 		 * REQUEUE_PI, waking up more than one waiter is creating
2162 		 * more problems than it solves. Waking up a waiter makes
2163 		 * only sense if the PI futex @uaddr2 is uncontended as
2164 		 * this allows the requeue code to acquire the futex
2165 		 * @uaddr2 before waking the waiter. The waiter can then
2166 		 * return to user space without further action. A secondary
2167 		 * wakeup would just make the futex_wait_requeue_pi()
2168 		 * handling more complex, because that code would have to
2169 		 * look up pi_state and do more or less all the handling
2170 		 * which the requeue code has to do for the to be requeued
2171 		 * waiters. So restrict the number of waiters to wake to
2172 		 * one, and only wake it up when the PI futex is
2173 		 * uncontended. Otherwise requeue it and let the unlock of
2174 		 * the PI futex handle the wakeup.
2175 		 *
2176 		 * All REQUEUE_PI users, e.g. pthread_cond_signal() and
2177 		 * pthread_cond_broadcast() must use nr_wake=1.
2178 		 */
2179 		if (nr_wake != 1)
2180 			return -EINVAL;
2181 
2182 		/*
2183 		 * requeue_pi requires a pi_state, try to allocate it now
2184 		 * without any locks in case it fails.
2185 		 */
2186 		if (refill_pi_state_cache())
2187 			return -ENOMEM;
2188 	}
2189 
2190 retry:
2191 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
2192 	if (unlikely(ret != 0))
2193 		return ret;
2194 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
2195 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
2196 	if (unlikely(ret != 0))
2197 		return ret;
2198 
2199 	/*
2200 	 * The check above which compares uaddrs is not sufficient for
2201 	 * shared futexes. We need to compare the keys:
2202 	 */
2203 	if (requeue_pi && match_futex(&key1, &key2))
2204 		return -EINVAL;
2205 
2206 	hb1 = hash_futex(&key1);
2207 	hb2 = hash_futex(&key2);
2208 
2209 retry_private:
2210 	hb_waiters_inc(hb2);
2211 	double_lock_hb(hb1, hb2);
2212 
2213 	if (likely(cmpval != NULL)) {
2214 		u32 curval;
2215 
2216 		ret = get_futex_value_locked(&curval, uaddr1);
2217 
2218 		if (unlikely(ret)) {
2219 			double_unlock_hb(hb1, hb2);
2220 			hb_waiters_dec(hb2);
2221 
2222 			ret = get_user(curval, uaddr1);
2223 			if (ret)
2224 				return ret;
2225 
2226 			if (!(flags & FLAGS_SHARED))
2227 				goto retry_private;
2228 
2229 			goto retry;
2230 		}
2231 		if (curval != *cmpval) {
2232 			ret = -EAGAIN;
2233 			goto out_unlock;
2234 		}
2235 	}
2236 
2237 	if (requeue_pi) {
2238 		struct task_struct *exiting = NULL;
2239 
2240 		/*
2241 		 * Attempt to acquire uaddr2 and wake the top waiter. If we
2242 		 * intend to requeue waiters, force setting the FUTEX_WAITERS
2243 		 * bit.  We force this here where we are able to easily handle
2244 		 * faults rather in the requeue loop below.
2245 		 *
2246 		 * Updates topwaiter::requeue_state if a top waiter exists.
2247 		 */
2248 		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
2249 						 &key2, &pi_state,
2250 						 &exiting, nr_requeue);
2251 
2252 		/*
2253 		 * At this point the top_waiter has either taken uaddr2 or
2254 		 * is waiting on it. In both cases pi_state has been
2255 		 * established and an initial refcount on it. In case of an
2256 		 * error there's nothing.
2257 		 *
2258 		 * The top waiter's requeue_state is up to date:
2259 		 *
2260 		 *  - If the lock was acquired atomically (ret == 1), then
2261 		 *    the state is Q_REQUEUE_PI_LOCKED.
2262 		 *
2263 		 *    The top waiter has been dequeued and woken up and can
2264 		 *    return to user space immediately. The kernel/user
2265 		 *    space state is consistent. In case that there must be
2266 		 *    more waiters requeued the WAITERS bit in the user
2267 		 *    space futex is set so the top waiter task has to go
2268 		 *    into the syscall slowpath to unlock the futex. This
2269 		 *    will block until this requeue operation has been
2270 		 *    completed and the hash bucket locks have been
2271 		 *    dropped.
2272 		 *
2273 		 *  - If the trylock failed with an error (ret < 0) then
2274 		 *    the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
2275 		 *    happened", or Q_REQUEUE_PI_IGNORE when there was an
2276 		 *    interleaved early wakeup.
2277 		 *
2278 		 *  - If the trylock did not succeed (ret == 0) then the
2279 		 *    state is either Q_REQUEUE_PI_IN_PROGRESS or
2280 		 *    Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
2281 		 *    This will be cleaned up in the loop below, which
2282 		 *    cannot fail because futex_proxy_trylock_atomic() did
2283 		 *    the same sanity checks for requeue_pi as the loop
2284 		 *    below does.
2285 		 */
2286 		switch (ret) {
2287 		case 0:
2288 			/* We hold a reference on the pi state. */
2289 			break;
2290 
2291 		case 1:
2292 			/*
2293 			 * futex_proxy_trylock_atomic() acquired the user space
2294 			 * futex. Adjust task_count.
2295 			 */
2296 			task_count++;
2297 			ret = 0;
2298 			break;
2299 
2300 		/*
2301 		 * If the above failed, then pi_state is NULL and
2302 		 * waiter::requeue_state is correct.
2303 		 */
2304 		case -EFAULT:
2305 			double_unlock_hb(hb1, hb2);
2306 			hb_waiters_dec(hb2);
2307 			ret = fault_in_user_writeable(uaddr2);
2308 			if (!ret)
2309 				goto retry;
2310 			return ret;
2311 		case -EBUSY:
2312 		case -EAGAIN:
2313 			/*
2314 			 * Two reasons for this:
2315 			 * - EBUSY: Owner is exiting and we just wait for the
2316 			 *   exit to complete.
2317 			 * - EAGAIN: The user space value changed.
2318 			 */
2319 			double_unlock_hb(hb1, hb2);
2320 			hb_waiters_dec(hb2);
2321 			/*
2322 			 * Handle the case where the owner is in the middle of
2323 			 * exiting. Wait for the exit to complete otherwise
2324 			 * this task might loop forever, aka. live lock.
2325 			 */
2326 			wait_for_owner_exiting(ret, exiting);
2327 			cond_resched();
2328 			goto retry;
2329 		default:
2330 			goto out_unlock;
2331 		}
2332 	}
2333 
2334 	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
2335 		if (task_count - nr_wake >= nr_requeue)
2336 			break;
2337 
2338 		if (!match_futex(&this->key, &key1))
2339 			continue;
2340 
2341 		/*
2342 		 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
2343 		 * be paired with each other and no other futex ops.
2344 		 *
2345 		 * We should never be requeueing a futex_q with a pi_state,
2346 		 * which is awaiting a futex_unlock_pi().
2347 		 */
2348 		if ((requeue_pi && !this->rt_waiter) ||
2349 		    (!requeue_pi && this->rt_waiter) ||
2350 		    this->pi_state) {
2351 			ret = -EINVAL;
2352 			break;
2353 		}
2354 
2355 		/* Plain futexes just wake or requeue and are done */
2356 		if (!requeue_pi) {
2357 			if (++task_count <= nr_wake)
2358 				mark_wake_futex(&wake_q, this);
2359 			else
2360 				requeue_futex(this, hb1, hb2, &key2);
2361 			continue;
2362 		}
2363 
2364 		/* Ensure we requeue to the expected futex for requeue_pi. */
2365 		if (!match_futex(this->requeue_pi_key, &key2)) {
2366 			ret = -EINVAL;
2367 			break;
2368 		}
2369 
2370 		/*
2371 		 * Requeue nr_requeue waiters and possibly one more in the case
2372 		 * of requeue_pi if we couldn't acquire the lock atomically.
2373 		 *
2374 		 * Prepare the waiter to take the rt_mutex. Take a refcount
2375 		 * on the pi_state and store the pointer in the futex_q
2376 		 * object of the waiter.
2377 		 */
2378 		get_pi_state(pi_state);
2379 
2380 		/* Don't requeue when the waiter is already on the way out. */
2381 		if (!futex_requeue_pi_prepare(this, pi_state)) {
2382 			/*
2383 			 * Early woken waiter signaled that it is on the
2384 			 * way out. Drop the pi_state reference and try the
2385 			 * next waiter. @this->pi_state is still NULL.
2386 			 */
2387 			put_pi_state(pi_state);
2388 			continue;
2389 		}
2390 
2391 		ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
2392 						this->rt_waiter,
2393 						this->task);
2394 
2395 		if (ret == 1) {
2396 			/*
2397 			 * We got the lock. We do neither drop the refcount
2398 			 * on pi_state nor clear this->pi_state because the
2399 			 * waiter needs the pi_state for cleaning up the
2400 			 * user space value. It will drop the refcount
2401 			 * after doing so. this::requeue_state is updated
2402 			 * in the wakeup as well.
2403 			 */
2404 			requeue_pi_wake_futex(this, &key2, hb2);
2405 			task_count++;
2406 		} else if (!ret) {
2407 			/* Waiter is queued, move it to hb2 */
2408 			requeue_futex(this, hb1, hb2, &key2);
2409 			futex_requeue_pi_complete(this, 0);
2410 			task_count++;
2411 		} else {
2412 			/*
2413 			 * rt_mutex_start_proxy_lock() detected a potential
2414 			 * deadlock when we tried to queue that waiter.
2415 			 * Drop the pi_state reference which we took above
2416 			 * and remove the pointer to the state from the
2417 			 * waiters futex_q object.
2418 			 */
2419 			this->pi_state = NULL;
2420 			put_pi_state(pi_state);
2421 			futex_requeue_pi_complete(this, ret);
2422 			/*
2423 			 * We stop queueing more waiters and let user space
2424 			 * deal with the mess.
2425 			 */
2426 			break;
2427 		}
2428 	}
2429 
2430 	/*
2431 	 * We took an extra initial reference to the pi_state in
2432 	 * futex_proxy_trylock_atomic(). We need to drop it here again.
2433 	 */
2434 	put_pi_state(pi_state);
2435 
2436 out_unlock:
2437 	double_unlock_hb(hb1, hb2);
2438 	wake_up_q(&wake_q);
2439 	hb_waiters_dec(hb2);
2440 	return ret ? ret : task_count;
2441 }
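
/*
 * Illustrative sketch, not kernel code: the classic non-PI consumer of
 * this operation is a condvar broadcast, which wakes one waiter and
 * requeues the rest onto the mutex word so they are woken one at a time
 * by subsequent unlocks. Names are hypothetical; for FUTEX_CMP_REQUEUE
 * the timeout slot carries nr_requeue and val3 the expected value:
 *
 *	syscall(SYS_futex, &cond_word, FUTEX_CMP_REQUEUE, 1,
 *		(void *)(unsigned long)INT_MAX, &mutex_word, expected);
 *
 * If *(&cond_word) != expected, the call fails with -EAGAIN, exactly as
 * the cmpval check above implements.
 */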
2442 
2443 /* The key must be already stored in q->key. */
2444 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
2445 	__acquires(&hb->lock)
2446 {
2447 	struct futex_hash_bucket *hb;
2448 
2449 	hb = hash_futex(&q->key);
2450 
2451 	/*
2452 	 * Increment the counter before taking the lock so that
2453 	 * a potential waker won't miss a to-be-slept task that is
2454 	 * waiting for the spinlock. This is safe as all queue_lock()
2455 	 * users end up calling queue_me(). Similarly, for housekeeping,
2456 	 * decrement the counter at queue_unlock() when some error has
2457 	 * occurred and we don't end up adding the task to the list.
2458 	 */
2459 	hb_waiters_inc(hb); /* implies smp_mb(); (A) */
2460 
2461 	q->lock_ptr = &hb->lock;
2462 
2463 	spin_lock(&hb->lock);
2464 	return hb;
2465 }
2466 
2467 static inline void
2468 queue_unlock(struct futex_hash_bucket *hb)
2469 	__releases(&hb->lock)
2470 {
2471 	spin_unlock(&hb->lock);
2472 	hb_waiters_dec(hb);
2473 }
2474 
2475 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2476 {
2477 	int prio;
2478 
2479 	/*
2480 	 * The priority used to register this element is
2481 	 * - either the real thread-priority for the real-time threads
2482 	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
2483 	 * - or MAX_RT_PRIO for non-RT threads.
2484 	 * Thus, all RT-threads are woken first in priority order, and
2485 	 * the others are woken last, in FIFO order.
2486 	 */
2487 	prio = min(current->normal_prio, MAX_RT_PRIO);
2488 
2489 	plist_node_init(&q->list, prio);
2490 	plist_add(&q->list, &hb->chain);
2491 	q->task = current;
2492 }
2493 
2494 /**
2495  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2496  * @q:	The futex_q to enqueue
2497  * @hb:	The destination hash bucket
2498  *
2499  * The hb->lock must be held by the caller, and is released here. A call to
2500  * queue_me() is typically paired with exactly one call to unqueue_me().  The
2501  * exceptions involve the PI related operations, which may use unqueue_me_pi()
2502  * or nothing if the unqueue is done as part of the wake process and the unqueue
2503  * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2504  * an example).
2505  */
2506 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2507 	__releases(&hb->lock)
2508 {
2509 	__queue_me(q, hb);
2510 	spin_unlock(&hb->lock);
2511 }
2512 
2513 /**
2514  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
2515  * @q:	The futex_q to unqueue
2516  *
2517  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
2518  * be paired with exactly one earlier call to queue_me().
2519  *
2520  * Return:
2521  *  - 1 - if the futex_q was still queued (and we removed it);
2522  *  - 0 - if the futex_q was already removed by the waking thread
2523  */
2524 static int unqueue_me(struct futex_q *q)
2525 {
2526 	spinlock_t *lock_ptr;
2527 	int ret = 0;
2528 
2529 	/* In the common case we don't take the spinlock, which is nice. */
2530 retry:
2531 	/*
2532 	 * q->lock_ptr can change between this read and the following spin_lock.
2533 	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
2534 	 * optimizing lock_ptr out of the logic below.
2535 	 */
2536 	lock_ptr = READ_ONCE(q->lock_ptr);
2537 	if (lock_ptr != NULL) {
2538 		spin_lock(lock_ptr);
2539 		/*
2540 		 * q->lock_ptr can change between reading it and
2541 		 * spin_lock(), causing us to take the wrong lock.  This
2542 		 * corrects the race condition.
2543 		 *
2544 		 * Reasoning goes like this: if we have the wrong lock,
2545 		 * q->lock_ptr must have changed (maybe several times)
2546 		 * between reading it and the spin_lock().  It can
2547 		 * change again after the spin_lock() but only if it was
2548 		 * already changed before the spin_lock().  It cannot,
2549 		 * however, change back to the original value.  Therefore
2550 		 * we can detect whether we acquired the correct lock.
2551 		 */
2552 		if (unlikely(lock_ptr != q->lock_ptr)) {
2553 			spin_unlock(lock_ptr);
2554 			goto retry;
2555 		}
2556 		__unqueue_futex(q);
2557 
2558 		BUG_ON(q->pi_state);
2559 
2560 		spin_unlock(lock_ptr);
2561 		ret = 1;
2562 	}
2563 
2564 	return ret;
2565 }
2566 
2567 /*
2568  * PI futexes can not be requeued and must remove themselves from the
2569  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
2570  */
2571 static void unqueue_me_pi(struct futex_q *q)
2572 {
2573 	__unqueue_futex(q);
2574 
2575 	BUG_ON(!q->pi_state);
2576 	put_pi_state(q->pi_state);
2577 	q->pi_state = NULL;
2578 }
2579 
2580 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2581 				  struct task_struct *argowner)
2582 {
2583 	struct futex_pi_state *pi_state = q->pi_state;
2584 	struct task_struct *oldowner, *newowner;
2585 	u32 uval, curval, newval, newtid;
2586 	int err = 0;
2587 
2588 	oldowner = pi_state->owner;
2589 
2590 	/*
2591 	 * We are here because either:
2592 	 *
2593 	 *  - we stole the lock and pi_state->owner needs updating to reflect
2594 	 *    that (@argowner == current),
2595 	 *
2596 	 * or:
2597 	 *
2598 	 *  - someone stole our lock and we need to fix things to point to the
2599 	 *    new owner (@argowner == NULL).
2600 	 *
2601 	 * Either way, we have to replace the TID in the user space variable.
2602 	 * This must be atomic as we have to preserve the owner died bit here.
2603 	 *
2604 	 * Note: We write the user space value _before_ changing the pi_state
2605 	 * because we can fault here. Imagine swapped out pages or a fork
2606 	 * that marked all the anonymous memory readonly for cow.
2607 	 *
2608 	 * Modifying pi_state _before_ the user space value would leave the
2609 	 * pi_state in an inconsistent state when we fault here, because we
2610 	 * need to drop the locks to handle the fault. This might be observed
2611 	 * in the PID checks when attaching to the PI state.
2612 	 */
2613 retry:
2614 	if (!argowner) {
2615 		if (oldowner != current) {
2616 			/*
2617 			 * We raced against a concurrent self; things are
2618 			 * already fixed up. Nothing to do.
2619 			 */
2620 			return 0;
2621 		}
2622 
2623 		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2624 			/* We got the lock. pi_state is correct. Tell caller. */
2625 			return 1;
2626 		}
2627 
2628 		/*
2629 		 * The trylock just failed, so either there is an owner or
2630 		 * there is a higher priority waiter than this one.
2631 		 */
2632 		newowner = rt_mutex_owner(&pi_state->pi_mutex);
2633 		/*
2634 		 * If the higher priority waiter has not yet taken over the
2635 		 * rtmutex then newowner is NULL. We can't return here with
2636 		 * that state because it's inconsistent vs. the user space
2637 		 * state. So drop the locks and try again. It's a valid
2638 		 * situation and not any different from the other retry
2639 		 * conditions.
2640 		 */
2641 		if (unlikely(!newowner)) {
2642 			err = -EAGAIN;
2643 			goto handle_err;
2644 		}
2645 	} else {
2646 		WARN_ON_ONCE(argowner != current);
2647 		if (oldowner == current) {
2648 			/*
2649 			 * We raced against a concurrent self; things are
2650 			 * already fixed up. Nothing to do.
2651 			 */
2652 			return 1;
2653 		}
2654 		newowner = argowner;
2655 	}
2656 
2657 	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2658 	/* Owner died? */
2659 	if (!pi_state->owner)
2660 		newtid |= FUTEX_OWNER_DIED;
2661 
2662 	err = get_futex_value_locked(&uval, uaddr);
2663 	if (err)
2664 		goto handle_err;
2665 
2666 	for (;;) {
2667 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
2668 
2669 		err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
2670 		if (err)
2671 			goto handle_err;
2672 
2673 		if (curval == uval)
2674 			break;
2675 		uval = curval;
2676 	}
2677 
2678 	/*
2679 	 * We fixed up user space. Now we need to fix the pi_state
2680 	 * itself.
2681 	 */
2682 	pi_state_update_owner(pi_state, newowner);
2683 
2684 	return argowner == current;
2685 
2686 	/*
2687 	 * In order to reschedule or handle a page fault, we need to drop the
2688 	 * locks here. In the case of a fault, this gives the other task
2689 	 * (either the highest priority waiter itself or the task which stole
2690 	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
2691 	 * are back from handling the fault we need to check the pi_state after
2692 	 * reacquiring the locks and before trying to do another fixup. When
2693 	 * the fixup has been done already we simply return.
2694 	 *
2695 	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2696 	 * drop hb->lock since the caller owns the hb -> futex_q relation.
2697 	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
2698 	 */
2699 handle_err:
2700 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2701 	spin_unlock(q->lock_ptr);
2702 
2703 	switch (err) {
2704 	case -EFAULT:
2705 		err = fault_in_user_writeable(uaddr);
2706 		break;
2707 
2708 	case -EAGAIN:
2709 		cond_resched();
2710 		err = 0;
2711 		break;
2712 
2713 	default:
2714 		WARN_ON_ONCE(1);
2715 		break;
2716 	}
2717 
2718 	spin_lock(q->lock_ptr);
2719 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2720 
2721 	/*
2722 	 * Check if someone else fixed it for us:
2723 	 */
2724 	if (pi_state->owner != oldowner)
2725 		return argowner == current;
2726 
2727 	/* Retry if err was -EAGAIN or the fault-in succeeded */
2728 	if (!err)
2729 		goto retry;
2730 
2731 	/*
2732 	 * fault_in_user_writeable() failed so user state is immutable. At
2733 	 * best we can make the kernel state consistent but user state will
2734 	 * be most likely hosed and any subsequent unlock operation will be
2735 	 * rejected due to PI futex rule [10].
2736 	 *
2737 	 * Ensure that the rtmutex owner is also the pi_state owner despite
2738 	 * the user space value claiming something different. There is no
2739 	 * point in unlocking the rtmutex if current is the owner as it
2740 	 * would need to wait until the next waiter has taken the rtmutex
2741 	 * to guarantee consistent state. Keep it simple. Userspace asked
2742 	 * for this wrecked state.
2743 	 *
2744 	 * The rtmutex has an owner - either current or some other
2745 	 * task. See the EAGAIN loop above.
2746 	 */
2747 	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
2748 
2749 	return err;
2750 }
2751 
2752 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2753 				struct task_struct *argowner)
2754 {
2755 	struct futex_pi_state *pi_state = q->pi_state;
2756 	int ret;
2757 
2758 	lockdep_assert_held(q->lock_ptr);
2759 
2760 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2761 	ret = __fixup_pi_state_owner(uaddr, q, argowner);
2762 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2763 	return ret;
2764 }
2765 
2766 static long futex_wait_restart(struct restart_block *restart);
2767 
2768 /**
2769  * fixup_owner() - Post lock pi_state and corner case management
2770  * @uaddr:	user address of the futex
2771  * @q:		futex_q (contains pi_state and access to the rt_mutex)
2772  * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
2773  *
2774  * After attempting to lock an rt_mutex, this function is called to cleanup
2775  * the pi_state owner as well as handle race conditions that may allow us to
2776  * acquire the lock. Must be called with the hb lock held.
2777  *
2778  * Return:
2779  *  -  1 - success, lock taken;
2780  *  -  0 - success, lock not taken;
2781  *  - <0 - on error (-EFAULT)
2782  */
2783 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2784 {
2785 	if (locked) {
2786 		/*
2787 		 * Got the lock. We might not be the anticipated owner if we
2788 		 * did a lock-steal - fix up the PI-state in that case:
2789 		 *
2790 		 * Speculative pi_state->owner read (we don't hold wait_lock);
2791 		 * since we own the lock pi_state->owner == current is the
2792 		 * stable state, anything else needs more attention.
2793 		 */
2794 		if (q->pi_state->owner != current)
2795 			return fixup_pi_state_owner(uaddr, q, current);
2796 		return 1;
2797 	}
2798 
2799 	/*
2800 	 * If we didn't get the lock; check if anybody stole it from us. In
2801 	 * that case, we need to fix up the uval to point to them instead of
2802 	 * us, otherwise bad things happen. [10]
2803 	 *
2804 	 * Another speculative read; pi_state->owner == current is unstable
2805 	 * but needs our attention.
2806 	 */
2807 	if (q->pi_state->owner == current)
2808 		return fixup_pi_state_owner(uaddr, q, NULL);
2809 
2810 	/*
2811 	 * Paranoia check. If we did not take the lock, then we should not be
2812 	 * the owner of the rt_mutex. Warn and establish consistent state.
2813 	 */
2814 	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
2815 		return fixup_pi_state_owner(uaddr, q, current);
2816 
2817 	return 0;
2818 }
2819 
2820 /**
2821  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2822  * @hb:		the futex hash bucket, must be locked by the caller
2823  * @q:		the futex_q to queue up on
2824  * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
2825  */
2826 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2827 				struct hrtimer_sleeper *timeout)
2828 {
2829 	/*
2830 	 * The task state is guaranteed to be set before another task can
2831 	 * wake it. set_current_state() is implemented using smp_store_mb() and
2832 	 * queue_me() calls spin_unlock() upon completion, both serializing
2833 	 * access to the hash list and forcing another memory barrier.
2834 	 */
2835 	set_current_state(TASK_INTERRUPTIBLE);
2836 	queue_me(q, hb);
2837 
2838 	/* Arm the timer */
2839 	if (timeout)
2840 		hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
2841 
2842 	/*
2843 	 * If we have been removed from the hash list, then another task
2844 	 * has tried to wake us, and we can skip the call to schedule().
2845 	 */
2846 	if (likely(!plist_node_empty(&q->list))) {
2847 		/*
2848 		 * If the timer has already expired, current will already be
2849 		 * flagged for rescheduling. Only call schedule if there
2850 		 * is no timeout, or if it has yet to expire.
2851 		 */
2852 		if (!timeout || timeout->task)
2853 			freezable_schedule();
2854 	}
2855 	__set_current_state(TASK_RUNNING);
2856 }
2857 
2858 /**
2859  * futex_wait_setup() - Prepare to wait on a futex
2860  * @uaddr:	the futex userspace address
2861  * @val:	the expected value
2862  * @flags:	futex flags (FLAGS_SHARED, etc.)
2863  * @q:		the associated futex_q
2864  * @hb:		storage for hash_bucket pointer to be returned to caller
2865  *
2866  * Setup the futex_q and locate the hash_bucket.  Get the futex value and
2867  * compare it with the expected value.  Handle atomic faults internally.
2868  * Return with the hb lock held on success, and unlocked on failure.
2869  *
2870  * Return:
2871  *  -  0 - uaddr contains val and hb has been locked;
2872  *  - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2873  */
2874 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2875 			   struct futex_q *q, struct futex_hash_bucket **hb)
2876 {
2877 	u32 uval;
2878 	int ret;
2879 
2880 	/*
2881 	 * Access the page AFTER the hash-bucket is locked.
2882 	 * Order is important:
2883 	 *
2884 	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2885 	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
2886 	 *
2887 	 * The basic logical guarantee of a futex is that it blocks ONLY
2888 	 * if cond(var) is known to be true at the time of blocking, for
2889 	 * any cond.  If we locked the hash-bucket after testing *uaddr, that
2890 	 * would open a race condition where we could block indefinitely with
2891 	 * cond(var) false, which would violate the guarantee.
2892 	 *
2893 	 * On the other hand, we insert q and release the hash-bucket only
2894 	 * after testing *uaddr.  This guarantees that futex_wait() will NOT
2895 	 * absorb a wakeup if *uaddr does not match the desired values
2896 	 * while the syscall executes.
2897 	 */
2898 retry:
2899 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
2900 	if (unlikely(ret != 0))
2901 		return ret;
2902 
2903 retry_private:
2904 	*hb = queue_lock(q);
2905 
2906 	ret = get_futex_value_locked(&uval, uaddr);
2907 
2908 	if (ret) {
2909 		queue_unlock(*hb);
2910 
2911 		ret = get_user(uval, uaddr);
2912 		if (ret)
2913 			return ret;
2914 
2915 		if (!(flags & FLAGS_SHARED))
2916 			goto retry_private;
2917 
2918 		goto retry;
2919 	}
2920 
2921 	if (uval != val) {
2922 		queue_unlock(*hb);
2923 		ret = -EWOULDBLOCK;
2924 	}
2925 
2926 	return ret;
2927 }
2928 
2929 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2930 		      ktime_t *abs_time, u32 bitset)
2931 {
2932 	struct hrtimer_sleeper timeout, *to;
2933 	struct restart_block *restart;
2934 	struct futex_hash_bucket *hb;
2935 	struct futex_q q = futex_q_init;
2936 	int ret;
2937 
2938 	if (!bitset)
2939 		return -EINVAL;
2940 	q.bitset = bitset;
2941 
2942 	to = futex_setup_timer(abs_time, &timeout, flags,
2943 			       current->timer_slack_ns);
2944 retry:
2945 	/*
2946 	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
2947 	 * is initialized.
2948 	 */
2949 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2950 	if (ret)
2951 		goto out;
2952 
2953 	/* queue_me and wait for wakeup, timeout, or a signal. */
2954 	futex_wait_queue_me(hb, &q, to);
2955 
2956 	/* If we were woken (and unqueued), we succeeded, whatever. */
2957 	ret = 0;
2958 	if (!unqueue_me(&q))
2959 		goto out;
2960 	ret = -ETIMEDOUT;
2961 	if (to && !to->task)
2962 		goto out;
2963 
2964 	/*
2965 	 * We expect signal_pending(current), but we might be the
2966 	 * victim of a spurious wakeup as well.
2967 	 */
2968 	if (!signal_pending(current))
2969 		goto retry;
2970 
2971 	ret = -ERESTARTSYS;
2972 	if (!abs_time)
2973 		goto out;
2974 
2975 	restart = &current->restart_block;
2976 	restart->futex.uaddr = uaddr;
2977 	restart->futex.val = val;
2978 	restart->futex.time = *abs_time;
2979 	restart->futex.bitset = bitset;
2980 	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2981 
2982 	ret = set_restart_fn(restart, futex_wait_restart);
2983 
2984 out:
2985 	if (to) {
2986 		hrtimer_cancel(&to->timer);
2987 		destroy_hrtimer_on_stack(&to->timer);
2988 	}
2989 	return ret;
2990 }
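
/*
 * Illustrative sketch, not kernel code: the user space pattern this wait
 * path serves, using C11 atomics on a 32-bit futex word. The kernel
 * blocks the caller only if the word still holds the value the caller
 * saw, which closes the missed-wakeup window. LOCKED is a hypothetical
 * value:
 *
 *	while (atomic_load(&futex_word) == LOCKED) {
 *		// Returns -EAGAIN without sleeping if futex_word changed
 *		// between the load above and the in-kernel recheck.
 *		syscall(SYS_futex, &futex_word, FUTEX_WAIT, LOCKED,
 *			NULL, NULL, 0);
 *	}
 */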
2991 
2992 
2993 static long futex_wait_restart(struct restart_block *restart)
2994 {
2995 	u32 __user *uaddr = restart->futex.uaddr;
2996 	ktime_t t, *tp = NULL;
2997 
2998 	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2999 		t = restart->futex.time;
3000 		tp = &t;
3001 	}
3002 	restart->fn = do_no_restart_syscall;
3003 
3004 	return (long)futex_wait(uaddr, restart->futex.flags,
3005 				restart->futex.val, tp, restart->futex.bitset);
3006 }
3007 
3008 
3009 /*
3010  * Userspace tried a 0 -> TID atomic transition of the futex value
3011  * and failed. The kernel side here does the whole locking operation:
3012  * if there are waiters then it will block as a consequence of relying
3013  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
3014  * a 0 value of the futex too.)
3015  *
3016  * Also serves as the futex trylock operation when @trylock is set, with the
 * corresponding semantics.
3017  */
3018 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
3019 			 ktime_t *time, int trylock)
3020 {
3021 	struct hrtimer_sleeper timeout, *to;
3022 	struct task_struct *exiting = NULL;
3023 	struct rt_mutex_waiter rt_waiter;
3024 	struct futex_hash_bucket *hb;
3025 	struct futex_q q = futex_q_init;
3026 	int res, ret;
3027 
3028 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
3029 		return -ENOSYS;
3030 
3031 	if (refill_pi_state_cache())
3032 		return -ENOMEM;
3033 
3034 	to = futex_setup_timer(time, &timeout, flags, 0);
3035 
3036 retry:
3037 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
3038 	if (unlikely(ret != 0))
3039 		goto out;
3040 
3041 retry_private:
3042 	hb = queue_lock(&q);
3043 
3044 	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
3045 				   &exiting, 0);
3046 	if (unlikely(ret)) {
3047 		/*
3048 		 * Atomic work succeeded and we got the lock,
3049 		 * or failed. Either way, we do _not_ block.
3050 		 */
3051 		switch (ret) {
3052 		case 1:
3053 			/* We got the lock. */
3054 			ret = 0;
3055 			goto out_unlock_put_key;
3056 		case -EFAULT:
3057 			goto uaddr_faulted;
3058 		case -EBUSY:
3059 		case -EAGAIN:
3060 			/*
3061 			 * Two reasons for this:
3062 			 * - EBUSY: Task is exiting and we just wait for the
3063 			 *   exit to complete.
3064 			 * - EAGAIN: The user space value changed.
3065 			 */
3066 			queue_unlock(hb);
3067 			/*
3068 			 * Handle the case where the owner is in the middle of
3069 			 * exiting. Wait for the exit to complete otherwise
3070 			 * this task might loop forever, aka. live lock.
3071 			 */
3072 			wait_for_owner_exiting(ret, exiting);
3073 			cond_resched();
3074 			goto retry;
3075 		default:
3076 			goto out_unlock_put_key;
3077 		}
3078 	}
3079 
3080 	WARN_ON(!q.pi_state);
3081 
3082 	/*
3083 	 * Only actually queue now that the atomic ops are done:
3084 	 */
3085 	__queue_me(&q, hb);
3086 
3087 	if (trylock) {
3088 		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
3089 		/* Fixup the trylock return value: */
3090 		ret = ret ? 0 : -EWOULDBLOCK;
3091 		goto no_block;
3092 	}
3093 
3094 	rt_mutex_init_waiter(&rt_waiter);
3095 
3096 	/*
3097 	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
3098 	 * hold it while doing rt_mutex_start_proxy(), because then it will
3099 	 * include hb->lock in the blocking chain, even though we'll not in
3100 	 * fact hold it while blocking. This will lead it to report -EDEADLK
3101 	 * and BUG when futex_unlock_pi() interleaves with this.
3102 	 *
3103 	 * Therefore acquire wait_lock while holding hb->lock, but drop the
3104 	 * latter before calling __rt_mutex_start_proxy_lock(). This
3105 	 * interleaves with futex_unlock_pi() -- which does a similar lock
3106 	 * handoff -- such that the latter can observe the futex_q::pi_state
3107 	 * before __rt_mutex_start_proxy_lock() is done.
3108 	 */
3109 	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
3110 	spin_unlock(q.lock_ptr);
3111 	/*
3112 	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
3113 	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
3114 	 * it sees the futex_q::pi_state.
3115 	 */
3116 	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
3117 	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
3118 
3119 	if (ret) {
3120 		if (ret == 1)
3121 			ret = 0;
3122 		goto cleanup;
3123 	}
3124 
3125 	if (unlikely(to))
3126 		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
3127 
3128 	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
3129 
3130 cleanup:
3131 	spin_lock(q.lock_ptr);
3132 	/*
3133 	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
3134 	 * first acquire the hb->lock before removing the lock from the
3135 	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
3136 	 * lists consistent.
3137 	 *
3138 	 * In particular; it is important that futex_unlock_pi() can not
3139 	 * observe this inconsistency.
3140 	 */
3141 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
3142 		ret = 0;
3143 
3144 no_block:
3145 	/*
3146 	 * Fixup the pi_state owner and possibly acquire the lock if we
3147 	 * haven't already.
3148 	 */
3149 	res = fixup_owner(uaddr, &q, !ret);
3150 	/*
3151 	 * If fixup_owner() returned an error, propagate that.  If it acquired
3152 	 * the lock, clear our -ETIMEDOUT or -EINTR.
3153 	 */
3154 	if (res)
3155 		ret = (res < 0) ? res : 0;
3156 
3157 	unqueue_me_pi(&q);
3158 	spin_unlock(q.lock_ptr);
3159 	goto out;
3160 
3161 out_unlock_put_key:
3162 	queue_unlock(hb);
3163 
3164 out:
3165 	if (to) {
3166 		hrtimer_cancel(&to->timer);
3167 		destroy_hrtimer_on_stack(&to->timer);
3168 	}
3169 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
3170 
3171 uaddr_faulted:
3172 	queue_unlock(hb);
3173 
3174 	ret = fault_in_user_writeable(uaddr);
3175 	if (ret)
3176 		goto out;
3177 
3178 	if (!(flags & FLAGS_SHARED))
3179 		goto retry_private;
3180 
3181 	goto retry;
3182 }
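
/*
 * Illustrative sketch, not kernel code: user space reaches this slowpath
 * only after the uncontended 0 -> TID cmpxchg failed. lock_word and
 * my_tid (the caller's gettid() value) are hypothetical names:
 *
 *	uint32_t expected = 0;
 *	if (!atomic_compare_exchange_strong(&lock_word, &expected, my_tid))
 *		syscall(SYS_futex, &lock_word, FUTEX_LOCK_PI, 0,
 *			NULL, NULL, 0);
 */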
3183 
3184 /*
3185  * Userspace attempted a TID -> 0 atomic transition, and failed.
3186  * This is the in-kernel slowpath: we look up the PI state (if any),
3187  * and do the rt-mutex unlock.
3188  */
3189 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
3190 {
3191 	u32 curval, uval, vpid = task_pid_vnr(current);
3192 	union futex_key key = FUTEX_KEY_INIT;
3193 	struct futex_hash_bucket *hb;
3194 	struct futex_q *top_waiter;
3195 	int ret;
3196 
3197 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
3198 		return -ENOSYS;
3199 
3200 retry:
3201 	if (get_user(uval, uaddr))
3202 		return -EFAULT;
3203 	/*
3204 	 * We release only a lock we actually own:
3205 	 */
3206 	if ((uval & FUTEX_TID_MASK) != vpid)
3207 		return -EPERM;
3208 
3209 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
3210 	if (ret)
3211 		return ret;
3212 
3213 	hb = hash_futex(&key);
3214 	spin_lock(&hb->lock);
3215 
3216 	/*
3217 	 * Check waiters first. We do not trust user space values at
3218 	 * all and we at least want to know if user space fiddled
3219 	 * with the futex value instead of blindly unlocking.
3220 	 */
3221 	top_waiter = futex_top_waiter(hb, &key);
3222 	if (top_waiter) {
3223 		struct futex_pi_state *pi_state = top_waiter->pi_state;
3224 
3225 		ret = -EINVAL;
3226 		if (!pi_state)
3227 			goto out_unlock;
3228 
3229 		/*
3230 		 * If current does not own the pi_state then the futex is
3231 		 * inconsistent and user space fiddled with the futex value.
3232 		 */
3233 		if (pi_state->owner != current)
3234 			goto out_unlock;
3235 
3236 		get_pi_state(pi_state);
3237 		/*
3238 		 * By taking wait_lock while still holding hb->lock, we ensure
3239 		 * there is no point where we hold neither; and therefore
3240 		 * wake_futex_pi() must observe a state consistent with what we
3241 		 * observed.
3242 		 *
3243 		 * In particular; this forces __rt_mutex_start_proxy() to
3244 		 * complete such that we're guaranteed to observe the
3245 		 * rt_waiter. Also see the WARN in wake_futex_pi().
3246 		 */
3247 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
3248 		spin_unlock(&hb->lock);
3249 
3250 		/* drops pi_state->pi_mutex.wait_lock */
3251 		ret = wake_futex_pi(uaddr, uval, pi_state);
3252 
3253 		put_pi_state(pi_state);
3254 
3255 		/*
3256 		 * Success, we're done! No tricky corner cases.
3257 		 */
3258 		if (!ret)
3259 			return ret;
3260 		/*
3261 		 * The atomic access to the futex value generated a
3262 		 * pagefault, so retry the user-access and the wakeup:
3263 		 */
3264 		if (ret == -EFAULT)
3265 			goto pi_faulted;
3266 		/*
3267 		 * An unconditional UNLOCK_PI op raced against a waiter
3268 		 * setting the FUTEX_WAITERS bit. Try again.
3269 		 */
3270 		if (ret == -EAGAIN)
3271 			goto pi_retry;
3272 		/*
3273 		 * wake_futex_pi has detected invalid state. Tell user
3274 		 * space.
3275 		 */
3276 		return ret;
3277 	}
3278 
3279 	/*
3280 	 * We have no kernel internal state, i.e. no waiters in the
3281 	 * kernel. Waiters which are about to queue themselves are stuck
3282 	 * on hb->lock. So we can safely ignore them. We preserve
3283 	 * neither the WAITERS bit nor the OWNER_DIED one. We are the
3284 	 * owner.
3285 	 */
3286 	if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3287 		spin_unlock(&hb->lock);
3288 		switch (ret) {
3289 		case -EFAULT:
3290 			goto pi_faulted;
3291 
3292 		case -EAGAIN:
3293 			goto pi_retry;
3294 
3295 		default:
3296 			WARN_ON_ONCE(1);
3297 			return ret;
3298 		}
3299 	}
3300 
3301 	/*
3302 	 * If uval has changed, let user space handle it.
3303 	 */
3304 	ret = (curval == uval) ? 0 : -EAGAIN;
3305 
3306 out_unlock:
3307 	spin_unlock(&hb->lock);
3308 	return ret;
3309 
3310 pi_retry:
3311 	cond_resched();
3312 	goto retry;
3313 
3314 pi_faulted:
3315 
3316 	ret = fault_in_user_writeable(uaddr);
3317 	if (!ret)
3318 		goto retry;
3319 
3320 	return ret;
3321 }
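
/*
 * Illustrative sketch, not kernel code: the matching unlock fast path.
 * Only when the TID -> 0 cmpxchg fails (FUTEX_WAITERS and/or
 * FUTEX_OWNER_DIED is set in the word) does user space enter the
 * slowpath above. Names are hypothetical:
 *
 *	uint32_t expected = my_tid;
 *	if (!atomic_compare_exchange_strong(&lock_word, &expected, 0))
 *		syscall(SYS_futex, &lock_word, FUTEX_UNLOCK_PI, 0,
 *			NULL, NULL, 0);
 */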
3322 
3323 /**
3324  * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
3325  * @hb:		the hash_bucket the futex_q was originally enqueued on
3326  * @q:		the futex_q woken while waiting to be requeued
3327  * @timeout:	the timeout associated with the wait (NULL if none)
3328  *
3329  * Determine the cause for the early wakeup.
3330  *
3331  * Return:
3332  *  -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
3333  */
3334 static inline
3335 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
3336 				   struct futex_q *q,
3337 				   struct hrtimer_sleeper *timeout)
3338 {
3339 	int ret;
3340 
3341 	/*
3342 	 * With the hb lock held, we avoid races while we process the wakeup.
3343 	 * We only need to hold hb (and not hb2) to ensure atomicity as the
3344 	 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
3345 	 * It can't be requeued from uaddr2 to something else since we don't
3346 	 * support a PI aware source futex for requeue.
3347 	 */
3348 	WARN_ON_ONCE(&hb->lock != q->lock_ptr);
3349 
3350 	/*
3351 	 * We were woken prior to requeue by a timeout or a signal.
3352 	 * Unqueue the futex_q and determine which it was.
3353 	 */
3354 	plist_del(&q->list, &hb->chain);
3355 	hb_waiters_dec(hb);
3356 
3357 	/* Handle spurious wakeups gracefully */
3358 	ret = -EWOULDBLOCK;
3359 	if (timeout && !timeout->task)
3360 		ret = -ETIMEDOUT;
3361 	else if (signal_pending(current))
3362 		ret = -ERESTARTNOINTR;
3363 	return ret;
3364 }
3365 
3366 /**
3367  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
3368  * @uaddr:	the futex we initially wait on (non-pi)
3369  * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.); both futexes
3370  *		must be of the same type, no requeueing from private to shared, etc.
3371  * @val:	the expected value of uaddr
3372  * @abs_time:	absolute timeout
3373  * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
3374  * @uaddr2:	the pi futex we will take prior to returning to user-space
3375  *
3376  * The caller will wait on uaddr and will be requeued by futex_requeue() to
3377  * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
3378  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
3379  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
3380  * without one, the pi logic would not know which task to boost/deboost, if
3381  * there was a need to.
3382  *
3383  * We call schedule in futex_wait_queue_me() when we enqueue and return there
3384  * via the following--
3385  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
3386  * 2) wakeup on uaddr2 after a requeue
3387  * 3) signal
3388  * 4) timeout
3389  *
3390  * If 3, cleanup and return -ERESTARTNOINTR.
3391  *
3392  * If 2, we may then block on trying to take the rt_mutex and return via:
3393  * 5) successful lock
3394  * 6) signal
3395  * 7) timeout
3396  * 8) other lock acquisition failure
3397  *
3398  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
3399  *
3400  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
3401  *
3402  * Return:
3403  *  -  0 - On success;
3404  *  - <0 - On error
3405  */
3406 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3407 				 u32 val, ktime_t *abs_time, u32 bitset,
3408 				 u32 __user *uaddr2)
3409 {
3410 	struct hrtimer_sleeper timeout, *to;
3411 	struct rt_mutex_waiter rt_waiter;
3412 	struct futex_hash_bucket *hb;
3413 	union futex_key key2 = FUTEX_KEY_INIT;
3414 	struct futex_q q = futex_q_init;
3415 	struct rt_mutex_base *pi_mutex;
3416 	int res, ret;
3417 
3418 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
3419 		return -ENOSYS;
3420 
3421 	if (uaddr == uaddr2)
3422 		return -EINVAL;
3423 
3424 	if (!bitset)
3425 		return -EINVAL;
3426 
3427 	to = futex_setup_timer(abs_time, &timeout, flags,
3428 			       current->timer_slack_ns);
3429 
3430 	/*
3431 	 * The waiter is allocated on our stack, manipulated by the requeue
3432 	 * code while we sleep on uaddr.
3433 	 */
3434 	rt_mutex_init_waiter(&rt_waiter);
3435 
3436 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
3437 	if (unlikely(ret != 0))
3438 		goto out;
3439 
3440 	q.bitset = bitset;
3441 	q.rt_waiter = &rt_waiter;
3442 	q.requeue_pi_key = &key2;
3443 
3444 	/*
3445 	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
3446 	 * is initialized.
3447 	 */
3448 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
3449 	if (ret)
3450 		goto out;
3451 
3452 	/*
3453 	 * The check above which compares uaddrs is not sufficient for
3454 	 * shared futexes. We need to compare the keys:
3455 	 */
3456 	if (match_futex(&q.key, &key2)) {
3457 		queue_unlock(hb);
3458 		ret = -EINVAL;
3459 		goto out;
3460 	}
3461 
3462 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
3463 	futex_wait_queue_me(hb, &q, to);
3464 
3465 	switch (futex_requeue_pi_wakeup_sync(&q)) {
3466 	case Q_REQUEUE_PI_IGNORE:
3467 		/* The waiter is still on uaddr1 */
3468 		spin_lock(&hb->lock);
3469 		ret = handle_early_requeue_pi_wakeup(hb, &q, to);
3470 		spin_unlock(&hb->lock);
3471 		break;
3472 
3473 	case Q_REQUEUE_PI_LOCKED:
3474 		/* The requeue acquired the lock */
3475 		if (q.pi_state && (q.pi_state->owner != current)) {
3476 			spin_lock(q.lock_ptr);
3477 			ret = fixup_owner(uaddr2, &q, true);
3478 			/*
3479 			 * Drop the reference to the pi state which the
3480 			 * requeue_pi() code acquired for us.
3481 			 */
3482 			put_pi_state(q.pi_state);
3483 			spin_unlock(q.lock_ptr);
3484 			/*
3485 			 * Adjust the return value. It's either -EFAULT or
3486 			 * success (1) but the caller expects 0 for success.
3487 			 */
3488 			ret = ret < 0 ? ret : 0;
3489 		}
3490 		break;
3491 
3492 	case Q_REQUEUE_PI_DONE:
3493 		/* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
3494 		pi_mutex = &q.pi_state->pi_mutex;
3495 		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
3496 
3497 	/* Current is no longer pi_blocked_on */
3498 		spin_lock(q.lock_ptr);
3499 		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3500 			ret = 0;
3501 
3502 		debug_rt_mutex_free_waiter(&rt_waiter);
3503 		/*
3504 		 * Fixup the pi_state owner and possibly acquire the lock if we
3505 		 * haven't already.
3506 		 */
3507 		res = fixup_owner(uaddr2, &q, !ret);
3508 		/*
3509 		 * If fixup_owner() returned an error, propagate that.  If it
3510 		 * acquired the lock, clear -ETIMEDOUT or -EINTR.
3511 		 */
3512 		if (res)
3513 			ret = (res < 0) ? res : 0;
3514 
3515 		unqueue_me_pi(&q);
3516 		spin_unlock(q.lock_ptr);
3517 
3518 		if (ret == -EINTR) {
3519 			/*
3520 			 * We've already been requeued, but cannot restart
3521 			 * by calling futex_lock_pi() directly. We could
3522 			 * restart this syscall, but it would detect that
3523 			 * the user space "val" changed and return
3524 			 * -EWOULDBLOCK.  Save the overhead of the restart
3525 			 * and return -EWOULDBLOCK directly.
3526 			 */
3527 			ret = -EWOULDBLOCK;
3528 		}
3529 		break;
3530 	default:
3531 		BUG();
3532 	}
3533 
3534 out:
3535 	if (to) {
3536 		hrtimer_cancel(&to->timer);
3537 		destroy_hrtimer_on_stack(&to->timer);
3538 	}
3539 	return ret;
3540 }
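
/*
 * Illustrative pairing from user space (a sketch; the intended user of
 * this machinery is a PI-aware condvar implementation such as glibc's
 * pthread_cond_wait()/pthread_cond_signal(); futex() again stands for
 * a raw syscall helper). The waiter names the PI mutex it must hold on
 * return; the waker wakes at most one task and requeues the rest onto
 * that mutex:
 *
 *	// waiter, after releasing the PI mutex:
 *	futex(&cond, FUTEX_WAIT_REQUEUE_PI, cond_val, timeout, &mutex, 0);
 *
 *	// waker: nr_wake must be 1; nr_requeue is carried in the
 *	// timeout argument slot of the syscall
 *	futex(&cond, FUTEX_CMP_REQUEUE_PI, 1, (void *)(long)INT_MAX,
 *	      &mutex, cond_val);
 */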
3541 
3542 /*
3543  * Support for robust futexes: the kernel cleans up held futexes at
3544  * thread exit time.
3545  *
3546  * Implementation: user-space maintains a per-thread list of locks it
3547  * is holding. Upon do_exit(), the kernel carefully walks this list,
3548  * and marks all locks that are owned by this thread with the
3549  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
3550  * always manipulated with the lock held, so the list is private and
3551  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
3552  * field, to allow the kernel to clean up if the thread dies after
3553  * acquiring the lock, but just before it could have added itself to
3554  * the list. There can only be one such pending lock.
3555  */
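
/*
 * A sketch of the user space side of this contract (illustrative only,
 * loosely following what glibc does for robust pthread mutexes; struct
 * my_mutex, lock() and list_add() are assumptions for the example, not
 * ABI beyond struct robust_list_head itself):
 *
 *	struct robust_list_head head = {
 *		.list		 = { &head.list },	// empty, circular
 *		.futex_offset	 = offsetof(struct my_mutex, futex) -
 *				   offsetof(struct my_mutex, list),
 *		.list_op_pending = NULL,
 *	};
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 *	// per lock acquisition: announce first, then lock and enqueue
 *	head.list_op_pending = &m->list;
 *	lock(&m->futex);			// cmpxchg 0 -> TID
 *	list_add(&head.list, &m->list);		// now on the list
 *	head.list_op_pending = NULL;
 */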
3556 
3557 /**
3558  * sys_set_robust_list() - Set the robust-futex list head of a task
3559  * @head:	pointer to the list-head
3560  * @len:	length of the list-head, as userspace expects
3561  */
3562 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
3563 		size_t, len)
3564 {
3565 	if (!futex_cmpxchg_enabled)
3566 		return -ENOSYS;
3567 	/*
3568 	 * The kernel knows only one size for now:
3569 	 */
3570 	if (unlikely(len != sizeof(*head)))
3571 		return -EINVAL;
3572 
3573 	current->robust_list = head;
3574 
3575 	return 0;
3576 }
3577 
3578 /**
3579  * sys_get_robust_list() - Get the robust-futex list head of a task
3580  * @pid:	pid of the process [zero for current task]
3581  * @head_ptr:	pointer to a list-head pointer, the kernel fills it in
3582  * @len_ptr:	pointer to a length field, the kernel fills in the header size
3583  */
3584 SYSCALL_DEFINE3(get_robust_list, int, pid,
3585 		struct robust_list_head __user * __user *, head_ptr,
3586 		size_t __user *, len_ptr)
3587 {
3588 	struct robust_list_head __user *head;
3589 	unsigned long ret;
3590 	struct task_struct *p;
3591 
3592 	if (!futex_cmpxchg_enabled)
3593 		return -ENOSYS;
3594 
3595 	rcu_read_lock();
3596 
3597 	ret = -ESRCH;
3598 	if (!pid)
3599 		p = current;
3600 	else {
3601 		p = find_task_by_vpid(pid);
3602 		if (!p)
3603 			goto err_unlock;
3604 	}
3605 
3606 	ret = -EPERM;
3607 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3608 		goto err_unlock;
3609 
3610 	head = p->robust_list;
3611 	rcu_read_unlock();
3612 
3613 	if (put_user(sizeof(*head), len_ptr))
3614 		return -EFAULT;
3615 	return put_user(head, head_ptr);
3616 
3617 err_unlock:
3618 	rcu_read_unlock();
3619 
3620 	return ret;
3621 }
3622 
3623 /* Constants for the pending_op argument of handle_futex_death */
3624 #define HANDLE_DEATH_PENDING	true
3625 #define HANDLE_DEATH_LIST	false
3626 
3627 /*
3628  * Process a futex-list entry, check whether it's owned by the
3629  * dying task, and do notification if so:
3630  */
3631 static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
3632 			      bool pi, bool pending_op)
3633 {
3634 	u32 uval, nval, mval;
3635 	int err;
3636 
3637 	/* Futex address must be 32-bit aligned */
3638 	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3639 		return -1;
3640 
3641 retry:
3642 	if (get_user(uval, uaddr))
3643 		return -1;
3644 
3645 	/*
3646 	 * Special case for regular (non PI) futexes. The unlock path in
3647 	 * user space has two race scenarios:
3648 	 *
3649 	 * 1. The unlock path releases the user space futex value and
3650 	 *    before it can execute the futex() syscall to wake up
3651 	 *    waiters it is killed.
3652 	 *
3653 	 * 2. A woken up waiter is killed before it can acquire the
3654 	 *    futex in user space.
3655 	 *
3656 	 * In both cases the TID validation below prevents a wakeup of
3657 	 * potential waiters which can cause these waiters to block
3658 	 * forever.
3659 	 *
3660 	 * In both cases the following conditions are met:
3661 	 *
3662 	 *	1) task->robust_list->list_op_pending != NULL
3663 	 *	   @pending_op == true
3664 	 *	2) User space futex value == 0
3665 	 *	3) Regular futex: @pi == false
3666 	 *
3667 	 * If these conditions are met, it is safe to attempt waking up a
3668 	 * potential waiter without touching the user space futex value and
3669 	 * trying to set the OWNER_DIED bit. The user space futex value is
3670 	 * uncontended and the rest of the user space mutex state is
3671 	 * consistent, so a woken waiter will just take over the
3672 	 * uncontended futex. Setting the OWNER_DIED bit would create
3673 	 * inconsistent state and malfunction of the user space owner died
3674 	 * handling.
3675 	 */
3676 	if (pending_op && !pi && !uval) {
3677 		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3678 		return 0;
3679 	}
3680 
3681 	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
3682 		return 0;
3683 
3684 	/*
3685 	 * Ok, this dying thread is truly holding a futex
3686 	 * of interest. Set the OWNER_DIED bit atomically
3687 	 * via cmpxchg, and if the value had FUTEX_WAITERS
3688 	 * set, wake up a waiter (if any). (We have to do a
3689 	 * futex_wake() even if OWNER_DIED is already set -
3690 	 * to handle the rare but possible case of recursive
3691 	 * thread-death.) The rest of the cleanup is done in
3692 	 * userspace.
3693 	 */
3694 	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3695 
3696 	/*
3697 	 * We are not holding a lock here, but we want to have
3698 	 * the pagefault_disable/enable() protection because
3699 	 * we want to handle the fault gracefully. If the
3700 	 * access fails we try to fault in the futex with R/W
3701 	 * verification via get_user_pages. get_user() above
3702 	 * does not guarantee R/W access. If that fails we
3703 	 * give up and leave the futex locked.
3704 	 */
3705 	if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3706 		switch (err) {
3707 		case -EFAULT:
3708 			if (fault_in_user_writeable(uaddr))
3709 				return -1;
3710 			goto retry;
3711 
3712 		case -EAGAIN:
3713 			cond_resched();
3714 			goto retry;
3715 
3716 		default:
3717 			WARN_ON_ONCE(1);
3718 			return err;
3719 		}
3720 	}
3721 
3722 	if (nval != uval)
3723 		goto retry;
3724 
3725 	/*
3726 	 * Wake robust non-PI futexes here. The wakeup of
3727 	 * PI futexes happens in exit_pi_state():
3728 	 */
3729 	if (!pi && (uval & FUTEX_WAITERS))
3730 		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3731 
3732 	return 0;
3733 }
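
/*
 * What user space sees afterwards (a sketch): a waiter woken by the
 * futex_wake() above finds FUTEX_OWNER_DIED set in the futex value and
 * must treat the protected data as suspect, analogous to pthread's
 * EOWNERDEAD; recover_state() is a hypothetical helper:
 *
 *	u32 old = atomic_cmpxchg(&m->futex, 0, tid);
 *	if (old & FUTEX_OWNER_DIED) {
 *		// take over; keep OWNER_DIED until the state is repaired
 *		if (atomic_cmpxchg(&m->futex, old,
 *				   tid | FUTEX_OWNER_DIED) == old)
 *			recover_state(m);
 *	}
 */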
3734 
3735 /*
3736  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3737  */
3738 static inline int fetch_robust_entry(struct robust_list __user **entry,
3739 				     struct robust_list __user * __user *head,
3740 				     unsigned int *pi)
3741 {
3742 	unsigned long uentry;
3743 
3744 	if (get_user(uentry, (unsigned long __user *)head))
3745 		return -EFAULT;
3746 
3747 	*entry = (void __user *)(uentry & ~1UL);
3748 	*pi = uentry & 1;
3749 
3750 	return 0;
3751 }
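
/*
 * Example encoding (illustrative numbers): a robust PI mutex whose
 * list entry lives at 0x7fffdeadbe10 is linked as 0x7fffdeadbe11;
 * masking off bit 0 recovers the entry pointer and the bit itself
 * selects PI-aware death handling:
 *
 *	uentry = 0x7fffdeadbe11
 *	*entry = 0x7fffdeadbe10, *pi = 1
 */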
3752 
3753 /*
3754  * Walk curr->robust_list (very carefully, it's a userspace list!)
3755  * and mark any locks found there dead, and notify any waiters.
3756  *
3757  * We silently return on any sign of list-walking problem.
3758  */
3759 static void exit_robust_list(struct task_struct *curr)
3760 {
3761 	struct robust_list_head __user *head = curr->robust_list;
3762 	struct robust_list __user *entry, *next_entry, *pending;
3763 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3764 	unsigned int next_pi;
3765 	unsigned long futex_offset;
3766 	int rc;
3767 
3768 	if (!futex_cmpxchg_enabled)
3769 		return;
3770 
3771 	/*
3772 	 * Fetch the list head (which was registered earlier, via
3773 	 * sys_set_robust_list()):
3774 	 */
3775 	if (fetch_robust_entry(&entry, &head->list.next, &pi))
3776 		return;
3777 	/*
3778 	 * Fetch the relative futex offset:
3779 	 */
3780 	if (get_user(futex_offset, &head->futex_offset))
3781 		return;
3782 	/*
3783 	 * Fetch any possibly pending lock-add first, and handle it
3784 	 * if it exists:
3785 	 */
3786 	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
3787 		return;
3788 
3789 	next_entry = NULL;	/* avoid warning with gcc */
3790 	while (entry != &head->list) {
3791 		/*
3792 		 * Fetch the next entry in the list before calling
3793 		 * handle_futex_death:
3794 		 */
3795 		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
3796 		/*
3797 		 * A pending lock might already be on the list, so
3798 		 * don't process it twice:
3799 		 */
3800 		if (entry != pending) {
3801 			if (handle_futex_death((void __user *)entry + futex_offset,
3802 						curr, pi, HANDLE_DEATH_LIST))
3803 				return;
3804 		}
3805 		if (rc)
3806 			return;
3807 		entry = next_entry;
3808 		pi = next_pi;
3809 		/*
3810 		 * Avoid excessively long or circular lists:
3811 		 */
3812 		if (!--limit)
3813 			break;
3814 
3815 		cond_resched();
3816 	}
3817 
3818 	if (pending) {
3819 		handle_futex_death((void __user *)pending + futex_offset,
3820 				   curr, pip, HANDLE_DEATH_PENDING);
3821 	}
3822 }
3823 
3824 static void futex_cleanup(struct task_struct *tsk)
3825 {
3826 	if (unlikely(tsk->robust_list)) {
3827 		exit_robust_list(tsk);
3828 		tsk->robust_list = NULL;
3829 	}
3830 
3831 #ifdef CONFIG_COMPAT
3832 	if (unlikely(tsk->compat_robust_list)) {
3833 		compat_exit_robust_list(tsk);
3834 		tsk->compat_robust_list = NULL;
3835 	}
3836 #endif
3837 
3838 	if (unlikely(!list_empty(&tsk->pi_state_list)))
3839 		exit_pi_state_list(tsk);
3840 }
3841 
3842 /**
3843  * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
3844  * @tsk:	task to set the state on
3845  *
3846  * Set the futex exit state of the task lockless. The futex waiter code
3847  * observes that state when a task is exiting and loops until the task has
3848  * actually finished the futex cleanup. The worst case for this is that the
3849  * waiter runs through the wait loop until the state becomes visible.
3850  *
3851  * This is called from the recursive fault handling path in do_exit().
3852  *
3853  * This is best effort. Either the futex exit code has run already or
3854  * not. If the OWNER_DIED bit has been set on the futex then the waiter can
3855  * take it over. If not, the problem is pushed back to user space. If the
3856  * futex exit code did not run yet, then an already queued waiter might
3857  * block forever, but there is nothing which can be done about that.
3858  */
3859 void futex_exit_recursive(struct task_struct *tsk)
3860 {
3861 	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
3862 	if (tsk->futex_state == FUTEX_STATE_EXITING)
3863 		mutex_unlock(&tsk->futex_exit_mutex);
3864 	tsk->futex_state = FUTEX_STATE_DEAD;
3865 }
3866 
3867 static void futex_cleanup_begin(struct task_struct *tsk)
3868 {
3869 	/*
3870 	 * Prevent various race issues against a concurrent incoming waiter
3871 	 * including live locks by forcing the waiter to block on
3872 	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
3873 	 * attach_to_pi_owner().
3874 	 */
3875 	mutex_lock(&tsk->futex_exit_mutex);
3876 
3877 	/*
3878 	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
3879 	 *
3880 	 * This ensures that all subsequent checks of tsk->futex_state in
3881 	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
3882 	 * tsk->pi_lock held.
3883 	 *
3884 	 * It also guarantees that a pi_state which was queued right before
3885 	 * the state change under tsk->pi_lock by a concurrent waiter must
3886 	 * be observed in exit_pi_state_list().
3887 	 */
3888 	raw_spin_lock_irq(&tsk->pi_lock);
3889 	tsk->futex_state = FUTEX_STATE_EXITING;
3890 	raw_spin_unlock_irq(&tsk->pi_lock);
3891 }
3892 
3893 static void futex_cleanup_end(struct task_struct *tsk, int state)
3894 {
3895 	/*
3896 	 * Lockless store. The only side effect is that an observer might
3897 	 * take another loop until it becomes visible.
3898 	 */
3899 	tsk->futex_state = state;
3900 	/*
3901 	 * Drop the exit protection. This unblocks waiters which observed
3902 	 * FUTEX_STATE_EXITING to reevaluate the state.
3903 	 */
3904 	mutex_unlock(&tsk->futex_exit_mutex);
3905 }
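
/*
 * The resulting handshake with a concurrent waiter, illustrated as a
 * timeline (a sketch of the common case):
 *
 * exiting task                          waiter in attach_to_pi_owner()
 * futex_cleanup_begin()
 *   mutex_lock(&futex_exit_mutex)
 *   futex_state = FUTEX_STATE_EXITING   observes EXITING under pi_lock
 * futex_cleanup()                       mutex_lock(&futex_exit_mutex)
 *   exit_robust_list() etc.               (blocks)
 * futex_cleanup_end(FUTEX_STATE_DEAD)
 *   futex_state = FUTEX_STATE_DEAD
 *   mutex_unlock(&futex_exit_mutex)     unblocks, reevaluates the state
 *                                       and the OWNER_DIED machinery
 *                                       takes over
 */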
3906 
3907 void futex_exec_release(struct task_struct *tsk)
3908 {
3909 	/*
3910 	 * The state handling is done for consistency, but in the case of
3911 	 * exec() there is no way to prevent further damage as the PID stays
3912 	 * the same. But for the unlikely and arguably buggy case that a
3913 	 * futex is held on exec(), this provides at least as much state
3914 	 * consistency protection as is possible.
3915 	 */
3916 	futex_cleanup_begin(tsk);
3917 	futex_cleanup(tsk);
3918 	/*
3919 	 * Reset the state to FUTEX_STATE_OK. The task is alive and about
3920 	 * to exec a new binary.
3921 	 */
3922 	futex_cleanup_end(tsk, FUTEX_STATE_OK);
3923 }
3924 
3925 void futex_exit_release(struct task_struct *tsk)
3926 {
3927 	futex_cleanup_begin(tsk);
3928 	futex_cleanup(tsk);
3929 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
3930 }
3931 
3932 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
3933 		u32 __user *uaddr2, u32 val2, u32 val3)
3934 {
3935 	int cmd = op & FUTEX_CMD_MASK;
3936 	unsigned int flags = 0;
3937 
3938 	if (!(op & FUTEX_PRIVATE_FLAG))
3939 		flags |= FLAGS_SHARED;
3940 
3941 	if (op & FUTEX_CLOCK_REALTIME) {
3942 		flags |= FLAGS_CLOCKRT;
3943 		if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
3944 		    cmd != FUTEX_LOCK_PI2)
3945 			return -ENOSYS;
3946 	}
3947 
3948 	switch (cmd) {
3949 	case FUTEX_LOCK_PI:
3950 	case FUTEX_LOCK_PI2:
3951 	case FUTEX_UNLOCK_PI:
3952 	case FUTEX_TRYLOCK_PI:
3953 	case FUTEX_WAIT_REQUEUE_PI:
3954 	case FUTEX_CMP_REQUEUE_PI:
3955 		if (!futex_cmpxchg_enabled)
3956 			return -ENOSYS;
3957 	}
3958 
3959 	switch (cmd) {
3960 	case FUTEX_WAIT:
3961 		val3 = FUTEX_BITSET_MATCH_ANY;
3962 		fallthrough;
3963 	case FUTEX_WAIT_BITSET:
3964 		return futex_wait(uaddr, flags, val, timeout, val3);
3965 	case FUTEX_WAKE:
3966 		val3 = FUTEX_BITSET_MATCH_ANY;
3967 		fallthrough;
3968 	case FUTEX_WAKE_BITSET:
3969 		return futex_wake(uaddr, flags, val, val3);
3970 	case FUTEX_REQUEUE:
3971 		return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
3972 	case FUTEX_CMP_REQUEUE:
3973 		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
3974 	case FUTEX_WAKE_OP:
3975 		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
3976 	case FUTEX_LOCK_PI:
3977 		flags |= FLAGS_CLOCKRT;
3978 		fallthrough;
3979 	case FUTEX_LOCK_PI2:
3980 		return futex_lock_pi(uaddr, flags, timeout, 0);
3981 	case FUTEX_UNLOCK_PI:
3982 		return futex_unlock_pi(uaddr, flags);
3983 	case FUTEX_TRYLOCK_PI:
3984 		return futex_lock_pi(uaddr, flags, NULL, 1);
3985 	case FUTEX_WAIT_REQUEUE_PI:
3986 		val3 = FUTEX_BITSET_MATCH_ANY;
3987 		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
3988 					     uaddr2);
3989 	case FUTEX_CMP_REQUEUE_PI:
3990 		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
3991 	}
3992 	return -ENOSYS;
3993 }
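
/*
 * Note for user space callers: glibc provides no futex() wrapper, so
 * the userspace sketches in this file assume a raw syscall helper
 * along these lines:
 *
 *	#include <stdint.h>
 *	#include <time.h>
 *	#include <unistd.h>
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *
 *	static long futex(uint32_t *uaddr, int op, uint32_t val,
 *			  const struct timespec *timeout,
 *			  uint32_t *uaddr2, uint32_t val3)
 *	{
 *		return syscall(SYS_futex, uaddr, op, val, timeout,
 *			       uaddr2, val3);
 *	}
 */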
3994 
3995 static __always_inline bool futex_cmd_has_timeout(u32 cmd)
3996 {
3997 	switch (cmd) {
3998 	case FUTEX_WAIT:
3999 	case FUTEX_LOCK_PI:
4000 	case FUTEX_LOCK_PI2:
4001 	case FUTEX_WAIT_BITSET:
4002 	case FUTEX_WAIT_REQUEUE_PI:
4003 		return true;
4004 	}
4005 	return false;
4006 }
4007 
4008 static __always_inline int
4009 futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
4010 {
4011 	if (!timespec64_valid(ts))
4012 		return -EINVAL;
4013 
4014 	*t = timespec64_to_ktime(*ts);
4015 	if (cmd == FUTEX_WAIT)
4016 		*t = ktime_add_safe(ktime_get(), *t);
4017 	else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
4018 		*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
4019 	return 0;
4020 }
4021 
4022 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
4023 		const struct __kernel_timespec __user *, utime,
4024 		u32 __user *, uaddr2, u32, val3)
4025 {
4026 	int ret, cmd = op & FUTEX_CMD_MASK;
4027 	ktime_t t, *tp = NULL;
4028 	struct timespec64 ts;
4029 
4030 	if (utime && futex_cmd_has_timeout(cmd)) {
4031 		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
4032 			return -EFAULT;
4033 		if (get_timespec64(&ts, utime))
4034 			return -EFAULT;
4035 		ret = futex_init_timeout(cmd, op, &ts, &t);
4036 		if (ret)
4037 			return ret;
4038 		tp = &t;
4039 	}
4040 
4041 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
4042 }
4043 
4044 #ifdef CONFIG_COMPAT
4045 /*
4046  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
4047  */
4048 static inline int
4049 compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
4050 		   compat_uptr_t __user *head, unsigned int *pi)
4051 {
4052 	if (get_user(*uentry, head))
4053 		return -EFAULT;
4054 
4055 	*entry = compat_ptr((*uentry) & ~1);
4056 	*pi = (unsigned int)(*uentry) & 1;
4057 
4058 	return 0;
4059 }
4060 
4061 static void __user *futex_uaddr(struct robust_list __user *entry,
4062 				compat_long_t futex_offset)
4063 {
4064 	compat_uptr_t base = ptr_to_compat(entry);
4065 	void __user *uaddr = compat_ptr(base + futex_offset);
4066 
4067 	return uaddr;
4068 }
4069 
4070 /*
4071  * Walk curr->robust_list (very carefully, it's a userspace list!)
4072  * and mark any locks found there dead, and notify any waiters.
4073  *
4074  * We silently return on any sign of list-walking problem.
4075  */
4076 static void compat_exit_robust_list(struct task_struct *curr)
4077 {
4078 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
4079 	struct robust_list __user *entry, *next_entry, *pending;
4080 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
4081 	unsigned int next_pi;
4082 	compat_uptr_t uentry, next_uentry, upending;
4083 	compat_long_t futex_offset;
4084 	int rc;
4085 
4086 	if (!futex_cmpxchg_enabled)
4087 		return;
4088 
4089 	/*
4090 	 * Fetch the list head (which was registered earlier, via
4091 	 * sys_set_robust_list()):
4092 	 */
4093 	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
4094 		return;
4095 	/*
4096 	 * Fetch the relative futex offset:
4097 	 */
4098 	if (get_user(futex_offset, &head->futex_offset))
4099 		return;
4100 	/*
4101 	 * Fetch any possibly pending lock-add first, and handle it
4102 	 * if it exists:
4103 	 */
4104 	if (compat_fetch_robust_entry(&upending, &pending,
4105 			       &head->list_op_pending, &pip))
4106 		return;
4107 
4108 	next_entry = NULL;	/* avoid warning with gcc */
4109 	while (entry != (struct robust_list __user *) &head->list) {
4110 		/*
4111 		 * Fetch the next entry in the list before calling
4112 		 * handle_futex_death:
4113 		 */
4114 		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
4115 			(compat_uptr_t __user *)&entry->next, &next_pi);
4116 		/*
4117 		 * A pending lock might already be on the list, so
4118 		 * don't process it twice:
4119 		 */
4120 		if (entry != pending) {
4121 			void __user *uaddr = futex_uaddr(entry, futex_offset);
4122 
4123 			if (handle_futex_death(uaddr, curr, pi,
4124 					       HANDLE_DEATH_LIST))
4125 				return;
4126 		}
4127 		if (rc)
4128 			return;
4129 		uentry = next_uentry;
4130 		entry = next_entry;
4131 		pi = next_pi;
4132 		/*
4133 		 * Avoid excessively long or circular lists:
4134 		 */
4135 		if (!--limit)
4136 			break;
4137 
4138 		cond_resched();
4139 	}
4140 	if (pending) {
4141 		void __user *uaddr = futex_uaddr(pending, futex_offset);
4142 
4143 		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
4144 	}
4145 }
4146 
4147 COMPAT_SYSCALL_DEFINE2(set_robust_list,
4148 		struct compat_robust_list_head __user *, head,
4149 		compat_size_t, len)
4150 {
4151 	if (!futex_cmpxchg_enabled)
4152 		return -ENOSYS;
4153 
4154 	if (unlikely(len != sizeof(*head)))
4155 		return -EINVAL;
4156 
4157 	current->compat_robust_list = head;
4158 
4159 	return 0;
4160 }
4161 
4162 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
4163 			compat_uptr_t __user *, head_ptr,
4164 			compat_size_t __user *, len_ptr)
4165 {
4166 	struct compat_robust_list_head __user *head;
4167 	unsigned long ret;
4168 	struct task_struct *p;
4169 
4170 	if (!futex_cmpxchg_enabled)
4171 		return -ENOSYS;
4172 
4173 	rcu_read_lock();
4174 
4175 	ret = -ESRCH;
4176 	if (!pid)
4177 		p = current;
4178 	else {
4179 		p = find_task_by_vpid(pid);
4180 		if (!p)
4181 			goto err_unlock;
4182 	}
4183 
4184 	ret = -EPERM;
4185 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
4186 		goto err_unlock;
4187 
4188 	head = p->compat_robust_list;
4189 	rcu_read_unlock();
4190 
4191 	if (put_user(sizeof(*head), len_ptr))
4192 		return -EFAULT;
4193 	return put_user(ptr_to_compat(head), head_ptr);
4194 
4195 err_unlock:
4196 	rcu_read_unlock();
4197 
4198 	return ret;
4199 }
4200 #endif /* CONFIG_COMPAT */
4201 
4202 #ifdef CONFIG_COMPAT_32BIT_TIME
4203 SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
4204 		const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
4205 		u32, val3)
4206 {
4207 	int ret, cmd = op & FUTEX_CMD_MASK;
4208 	ktime_t t, *tp = NULL;
4209 	struct timespec64 ts;
4210 
4211 	if (utime && futex_cmd_has_timeout(cmd)) {
4212 		if (get_old_timespec32(&ts, utime))
4213 			return -EFAULT;
4214 		ret = futex_init_timeout(cmd, op, &ts, &t);
4215 		if (ret)
4216 			return ret;
4217 		tp = &t;
4218 	}
4219 
4220 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
4221 }
4222 #endif /* CONFIG_COMPAT_32BIT_TIME */
4223 
4224 static void __init futex_detect_cmpxchg(void)
4225 {
4226 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
4227 	u32 curval;
4228 
4229 	/*
4230 	 * This will fail and we want it. Some arch implementations do
4231 	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
4232 	 * functionality. We want to know that before we call in any
4233 	 * of the complex code paths. Also we want to prevent
4234 	 * registration of robust lists in that case. NULL is
4235 	 * guaranteed to fault and we get -EFAULT on a functional
4236 	 * implementation; the non-functional ones will return
4237 	 * -ENOSYS.
4238 	 */
4239 	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
4240 		futex_cmpxchg_enabled = 1;
4241 #endif
4242 }
4243 
4244 static int __init futex_init(void)
4245 {
4246 	unsigned int futex_shift;
4247 	unsigned long i;
4248 
4249 #if CONFIG_BASE_SMALL
4250 	futex_hashsize = 16;
4251 #else
4252 	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
4253 #endif
4254 
4255 	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
4256 					       futex_hashsize, 0,
4257 					       futex_hashsize < 256 ? HASH_SMALL : 0,
4258 					       &futex_shift, NULL,
4259 					       futex_hashsize, futex_hashsize);
4260 	futex_hashsize = 1UL << futex_shift;
4261 
4262 	futex_detect_cmpxchg();
4263 
4264 	for (i = 0; i < futex_hashsize; i++) {
4265 		atomic_set(&futex_queues[i].waiters, 0);
4266 		plist_head_init(&futex_queues[i].chain);
4267 		spin_lock_init(&futex_queues[i].lock);
4268 	}
4269 
4270 	return 0;
4271 }
4272 core_initcall(futex_init);
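
/*
 * Worked sizing example for futex_init() (illustrative): with 6
 * possible CPUs and !CONFIG_BASE_SMALL, 256 * 6 = 1536 is rounded up
 * to 2048 hash buckets; alloc_large_system_hash() reports the final
 * order via futex_shift, and futex_hashsize is rewritten to
 * 1UL << futex_shift.
 */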
4273