1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Shared application/kernel submission and completion ring pairs, for
4  * supporting fast/efficient IO.
5  *
6  * A note on the read/write ordering memory barriers that are matched between
7  * the application and kernel side.
8  *
9  * After the application reads the CQ ring tail, it must use an
10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11  * before writing the tail (using smp_load_acquire to read the tail will
12  * do). It also needs a smp_mb() before updating CQ head (ordering the
13  * entry load(s) with the head store), pairing with an implicit barrier
14  * through a control-dependency in io_get_cqring (smp_store_release to
15  * store head will do). Failure to do so could lead to reading invalid
16  * CQ entries.
17  *
18  * Likewise, the application must use an appropriate smp_wmb() before
19  * writing the SQ tail (ordering SQ entry stores with the tail store),
20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21  * to store the tail will do). And it needs a barrier ordering the SQ
22  * head load before writing new SQ entries (smp_load_acquire to read
23  * head will do).
24  *
25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27  * updating the SQ tail; a full memory barrier smp_mb() is needed
28  * between.
29  *
30  * Also see the examples in the liburing library:
31  *
32  *	git://git.kernel.dk/liburing
33  *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
38  *
39  * Copyright (C) 2018-2019 Jens Axboe
40  * Copyright (c) 2018-2019 Christoph Hellwig
41  */
42 #include <linux/kernel.h>
43 #include <linux/init.h>
44 #include <linux/errno.h>
45 #include <linux/syscalls.h>
46 #include <linux/compat.h>
47 #include <net/compat.h>
48 #include <linux/refcount.h>
49 #include <linux/uio.h>
50 #include <linux/bits.h>
51 
52 #include <linux/sched/signal.h>
53 #include <linux/fs.h>
54 #include <linux/file.h>
55 #include <linux/fdtable.h>
56 #include <linux/mm.h>
57 #include <linux/mman.h>
58 #include <linux/percpu.h>
59 #include <linux/slab.h>
60 #include <linux/kthread.h>
61 #include <linux/blkdev.h>
62 #include <linux/bvec.h>
63 #include <linux/net.h>
64 #include <net/sock.h>
65 #include <net/af_unix.h>
66 #include <net/scm.h>
67 #include <linux/anon_inodes.h>
68 #include <linux/sched/mm.h>
69 #include <linux/uaccess.h>
70 #include <linux/nospec.h>
71 #include <linux/sizes.h>
72 #include <linux/hugetlb.h>
73 #include <linux/highmem.h>
74 #include <linux/namei.h>
75 #include <linux/fsnotify.h>
76 #include <linux/fadvise.h>
77 #include <linux/eventpoll.h>
78 #include <linux/fs_struct.h>
79 #include <linux/splice.h>
80 #include <linux/task_work.h>
81 #include <linux/pagemap.h>
82 #include <linux/io_uring.h>
83 #include <linux/blk-cgroup.h>
84 #include <linux/audit.h>
85 
86 #define CREATE_TRACE_POINTS
87 #include <trace/events/io_uring.h>
88 
89 #include <uapi/linux/io_uring.h>
90 
91 #include "internal.h"
92 #include "io-wq.h"
93 
/*
 * Entry-count limits. The CQ limit is defined as twice IORING_MAX_ENTRIES
 * (65536). NOTE(review): where these are enforced is not visible in this
 * part of the file.
 */
#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 *
 * IORING_MAX_FILES_TABLE is therefore 512, IORING_FILE_TABLE_MASK is 511,
 * and IORING_MAX_FIXED_FILES is 64 * 512 = 32768.
 */
#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
/* Sum of the three *_LAST constants (restriction, register and SQE opcodes) */
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
106 
/*
 * Head/tail pair of one ring. Each index is aligned to its own cacheline
 * on SMP builds. Embedded twice in struct io_rings (as 'sq' and 'cq').
 */
struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};
111 
/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 *
 * The structure ends in a flexible array of CQEs, so its size depends on
 * how many completion entries are allocated after the fixed header.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32                     cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 *
	 * Flexible array member: must stay last in the structure.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
189 
/*
 * Descriptor for one fixed mapped user buffer; io_ring_ctx::user_bufs
 * points at an array of these (nr_user_bufs entries).
 */
struct io_mapped_ubuf {
	u64		ubuf;		/* presumably the user address - TODO confirm */
	size_t		len;
	struct		bio_vec *bvec;	/* nr_bvecs elements */
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;	/* NOTE(review): accounting use not visible here */
};
197 
/*
 * One chunk of the fixed file set; fixed_file_data::table points at an
 * array of these. NOTE(review): each chunk presumably holds
 * IORING_MAX_FILES_TABLE file pointers - confirm against the allocator.
 */
struct fixed_file_table {
	struct file		**files;
};
201 
/*
 * Reference node attached to a fixed file set (see fixed_file_data::node
 * and fixed_file_data::ref_list). Carries its own percpu refcount and both
 * a list and a lockless-list linkage; how they are used is defined outside
 * the part of the file visible here.
 */
struct fixed_file_ref_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		file_list;
	struct fixed_file_data		*file_data;	/* owning file set */
	struct llist_node		llist;
	bool				done;
};
210 
/*
 * State of the registered ("fixed") file set of a ring; referenced from
 * io_ring_ctx::file_data.
 */
struct fixed_file_data {
	struct fixed_file_table		*table;	/* array of file table chunks */
	struct io_ring_ctx		*ctx;	/* back-pointer to the owning ring */

	struct fixed_file_ref_node	*node;
	struct percpu_ref		refs;
	struct completion		done;
	struct list_head		ref_list;
	spinlock_t			lock;
};
221 
/*
 * A single buffer that can be linked on a list; io_sr_msg::kbuf points at
 * one of these. NOTE(review): addr/len/bid mirror the fields of struct
 * io_provide_buf - presumably a provided buffer; confirm at the users.
 */
struct io_buffer {
	struct list_head list;
	__u64 addr;
	__s32 len;
	__u16 bid;
};
228 
/*
 * Restriction state embedded in io_ring_ctx::restrictions: one bitmap sized
 * for register opcodes (IORING_REGISTER_LAST bits), one sized for SQE
 * opcodes (IORING_OP_LAST bits), plus allowed/required SQE flag masks.
 */
struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};
236 
/*
 * Refcounted SQ poll thread state, referenced from io_ring_ctx::sq_data
 * ("if using sq thread polling"). The ctx lists allow more than one ring
 * to be attached to the same instance.
 */
struct io_sq_data {
	refcount_t		refs;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;
	struct list_head	ctx_new_list;
	struct mutex		ctx_lock;

	struct task_struct	*thread;
	struct wait_queue_head	wait;
};
249 
/*
 * Per-ring context. Allocated zero-filled by io_ring_ctx_alloc(). Fields
 * are grouped into anonymous structs that are each cacheline aligned on
 * SMP builds.
 */
struct io_ring_ctx {
	struct {
		/*
		 * Initialised in io_ring_ctx_alloc() with
		 * io_ring_ctx_ref_free() as release callback, which
		 * completes ->ref_comp.
		 */
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;

	struct {
		/* IORING_SETUP_* flags (e.g. tested for IORING_SETUP_SQPOLL) */
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		limit_mem: 1;
		unsigned int		cq_overflow_flushed: 1;
		unsigned int		drain_next: 1;
		unsigned int		eventfd_async: 1;
		unsigned int		restricted: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		unsigned		cached_sq_dropped;
		unsigned		cached_cq_overflow;
		unsigned long		sq_check_overflow;

		struct list_head	defer_list;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;

		wait_queue_head_t	inflight_wait;
		struct io_uring_sqe	*sq_sqes;
	} ____cacheline_aligned_in_smp;

	/* ring memory shared with the application, see struct io_rings */
	struct io_rings	*rings;

	/* IO offload */
	struct io_wq		*io_wq;

	/*
	 * For SQPOLL usage - we hold a reference to the parent task, so we
	 * have access to the ->files
	 *
	 * __io_sq_thread_acquire_mm() also takes the mm to use from this task.
	 */
	struct task_struct	*sqo_task;

	/* Only used for accounting purposes */
	struct mm_struct	*mm_account;

#ifdef CONFIG_BLK_CGROUP
	/* blkcg css that io_sq_thread_associate_blkcg() switches the thread to */
	struct cgroup_subsys_state	*sqo_blkcg_css;
#endif

	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct wait_queue_entry	sqo_wait_entry;
	struct list_head	sqd_list;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct fixed_file_data	*file_data;
	unsigned		nr_user_files;

	/* if used, fixed mapped user buffers */
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	*user_bufs;

	struct user_struct	*user;

	const struct cred	*creds;

#ifdef CONFIG_AUDIT
	kuid_t			loginuid;
	unsigned int		sessionid;
#endif

	/* completed by io_ring_ctx_ref_free() */
	struct completion	ref_comp;
	struct completion	sq_thread_comp;

	/* if all else fails... (allocated from req_cachep at ctx creation) */
	struct io_kiocb		*fallback_req;

#if defined(CONFIG_UNIX)
	/* io_uring_get_socket() returns ->sk of this socket */
	struct socket		*ring_sock;
#endif

	struct idr		io_buffer_idr;

	struct idr		personality_idr;

	struct {
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		unsigned		cq_mask;
		atomic_t		cq_timeouts;
		unsigned long		cq_check_overflow;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	iopoll_list;
		/* array of (1 << cancel_hash_bits) heads, set up at ctx creation */
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_file;

		spinlock_t		inflight_lock;
		struct list_head	inflight_list;
	} ____cacheline_aligned_in_smp;

	struct delayed_work		file_put_work;
	struct llist_head		file_put_llist;

	struct work_struct		exit_work;
	struct io_restriction		restrictions;
};
391 
/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 *
 * Poll state; used both as the 'poll' member of the io_kiocb union and
 * inside struct async_poll.
 */
struct io_poll_iocb {
	struct file			*file;
	/* 'head' and 'addr' share storage; only one is meaningful at a time */
	union {
		struct wait_queue_head	*head;
		u64			addr;
	};
	__poll_t			events;
	bool				done;
	bool				canceled;
	struct wait_queue_entry		wait;
};
407 
/* 'close' member of the io_kiocb union; 'file' must remain first. */
struct io_close {
	struct file			*file;
	struct file			*put_file;
	int				fd;
};
413 
/*
 * Async data for timeouts: io_op_defs uses sizeof() of this structure as
 * async_size for IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT.
 */
struct io_timeout_data {
	struct io_kiocb			*req;	/* request this timer belongs to */
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
};
420 
/* 'accept' member of the io_kiocb union; 'file' must remain first. */
struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;		/* userspace pointer */
	int __user			*addr_len;	/* userspace pointer */
	int				flags;
	unsigned long			nofile;
};
428 
/* 'sync' member of the io_kiocb union; 'file' must remain first. */
struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};
436 
/* 'cancel' member of the io_kiocb union; 'file' must remain first. */
struct io_cancel {
	struct file			*file;
	u64				addr;
};
441 
/*
 * 'timeout' member of the io_kiocb union; 'file' must remain first.
 * io_is_timeout_noseq() treats off == 0 as "no sequence" timeout.
 */
struct io_timeout {
	struct file			*file;
	u32				off;
	u32				target_seq;
	struct list_head		list;
};
448 
/* 'timeout_rem' member of the io_kiocb union; 'file' must remain first. */
struct io_timeout_rem {
	struct file			*file;
	u64				addr;
};
453 
/*
 * 'rw' member of the io_kiocb union. Has no explicit file pointer: the
 * embedded kiocb supplies it as its first member.
 */
struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};
460 
/* 'connect' member of the io_kiocb union; 'file' must remain first. */
struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;	/* userspace pointer */
	int				addr_len;
};
466 
/* 'sr_msg' member of the io_kiocb union; 'file' must remain first. */
struct io_sr_msg {
	struct file			*file;
	/* userspace pointer: either a msghdr or a plain buffer (shared storage) */
	union {
		struct user_msghdr __user *umsg;
		void __user		*buf;
	};
	int				msg_flags;
	int				bgid;
	size_t				len;
	struct io_buffer		*kbuf;
};
478 
/* 'open' member of the io_kiocb union; 'file' must remain first. */
struct io_open {
	struct file			*file;
	int				dfd;
	bool				ignore_nonblock;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};
487 
/* 'files_update' member of the io_kiocb union; 'file' must remain first. */
struct io_files_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};
494 
/* 'fadvise' member of the io_kiocb union; 'file' must remain first. */
struct io_fadvise {
	struct file			*file;
	u64				offset;
	u32				len;
	u32				advice;
};
501 
/* 'madvise' member of the io_kiocb union; 'file' must remain first. */
struct io_madvise {
	struct file			*file;
	u64				addr;
	u32				len;
	u32				advice;
};
508 
/* 'epoll' member of the io_kiocb union; 'file' must remain first. */
struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};
516 
/*
 * 'splice' member of the io_kiocb union. The leading file pointer is named
 * 'file_out' here, so it is what io_kiocb::file aliases for this member.
 */
struct io_splice {
	struct file			*file_out;
	struct file			*file_in;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	unsigned int			flags;
};
525 
/* 'pbuf' member of the io_kiocb union; 'file' must remain first. */
struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__s32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};
534 
/* 'statx' member of the io_kiocb union; 'file' must remain first. */
struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	const char __user		*filename;	/* userspace pointer */
	struct statx __user		*buffer;	/* userspace pointer */
};
543 
/*
 * 'compl' member of the io_kiocb union; 'file' must remain first. It
 * overlays the per-op data, so io_kiocb notes it may only be used after
 * that data has been cleaned (see io_clean_op()).
 */
struct io_completion {
	struct file			*file;
	struct list_head		list;
	int				cflags;
};
549 
/*
 * Async data for connect: io_op_defs uses sizeof() of this structure as
 * async_size for IORING_OP_CONNECT.
 */
struct io_async_connect {
	struct sockaddr_storage		address;
};
553 
/*
 * Async data for message send/receive: io_op_defs uses sizeof() of this
 * structure as async_size for IORING_OP_SENDMSG and IORING_OP_RECVMSG.
 */
struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];	/* inline iovec storage */
	struct iovec			*iov;
	struct sockaddr __user		*uaddr;			/* userspace pointer */
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};
561 
/*
 * Async data for reads and writes: io_op_defs uses sizeof() of this
 * structure as async_size for the READV, WRITEV, READ_FIXED, WRITE_FIXED,
 * READ and WRITE opcodes.
 */
struct io_async_rw {
	struct iovec			fast_iov[UIO_FASTIOV];	/* inline iovec storage */
	const struct iovec		*free_iovec;
	struct iov_iter			iter;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};
569 
/*
 * Bit positions for io_kiocb::flags. The first six take their values from
 * the corresponding IOSQE_*_BIT constants; the remaining ones are numbered
 * sequentially after REQ_F_BUFFER_SELECT_BIT. The matching masks are
 * defined in the enum that follows.
 */
enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,

	REQ_F_LINK_HEAD_BIT,
	REQ_F_FAIL_LINK_BIT,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_ISREG_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_NO_FILE_TABLE_BIT,
	REQ_F_WORK_INITIALIZED_BIT,
	REQ_F_LTIMEOUT_ACTIVE_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};
595 
/*
 * Flag masks for io_kiocb::flags, one per REQ_F_*_BIT position defined in
 * the preceding enum.
 */
enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),

	/* head of a link */
	REQ_F_LINK_HEAD		= BIT(REQ_F_LINK_HEAD_BIT),
	/* fail rest of links */
	REQ_F_FAIL_LINK		= BIT(REQ_F_FAIL_LINK_BIT),
	/* on inflight list */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* doesn't need file table for this request */
	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
	/* io_wq_work is initialized (set by __io_req_init_async()) */
	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
	/* linked timeout is active, i.e. prepared by link's head */
	REQ_F_LTIMEOUT_ACTIVE	= BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
};
637 
/*
 * Poll state for an async armed poll, referenced from io_kiocb::apoll:
 * one embedded poll entry plus a pointer to an optional second one.
 */
struct async_poll {
	struct io_poll_iocb	poll;
	struct io_poll_iocb	*double_poll;
};
642 
/*
 * The in-kernel representation of a single request.
 *
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_files_update	files_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		/* use only after cleaning per-op data, see io_clean_op() */
		struct io_completion	compl;
	};

	/* per-opcode data, allocated if the op needs to store state for async defer */
	void				*async_data;
	/* IORING_OP_* value; used as index into io_op_defs[] */
	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;

	u16				buf_index;
	u32				result;

	struct io_ring_ctx		*ctx;
	/* REQ_F_* flag mask */
	unsigned int			flags;
	refcount_t			refs;
	struct task_struct		*task;
	u64				user_data;

	struct list_head		link_list;

	/*
	 * 1. used with ctx->iopoll_list with reads/writes
	 * 2. to track reqs with ->files (see io_op_def::file_table)
	 */
	struct list_head		inflight_entry;

	struct percpu_ref		*fixed_file_refs;
	struct callback_head		task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	struct async_poll		*apoll;
	/* only valid once REQ_F_WORK_INITIALIZED is set, see io_req_init_async() */
	struct io_wq_work		work;
};
704 
/*
 * List entry pairing a request with a sequence number.
 * NOTE(review): presumably queued on io_ring_ctx::defer_list - confirm at
 * the users, which are outside the visible part of the file.
 */
struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};
710 
/* Batch size; also sizes the reqs[] alloc cache in struct io_submit_state */
#define IO_IOPOLL_BATCH			8
712 
/*
 * Completion batching state (embedded in struct io_submit_state as 'comp'):
 * a counter, a list head and the ring the batch belongs to.
 */
struct io_comp_state {
	unsigned int		nr;
	struct list_head	list;
	struct io_ring_ctx	*ctx;
};
718 
/*
 * State carried across a batch of submissions: a block plug, a small cache
 * of preallocated requests, batched completion state and a cached file
 * reference.
 */
struct io_submit_state {
	struct blk_plug		plug;

	/*
	 * io_kiocb alloc cache
	 */
	void			*reqs[IO_IOPOLL_BATCH];
	unsigned int		free_reqs;

	/*
	 * Batch completion logic
	 */
	struct io_comp_state	comp;

	/*
	 * File reference cache
	 */
	struct file		*file;
	unsigned int		fd;
	unsigned int		has_refs;
	unsigned int		ios_left;
};
741 
/*
 * Static per-opcode properties; one instance per IORING_OP_* lives in the
 * io_op_defs[] table.
 */
struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* don't fail if file grab fails */
	unsigned		needs_file_no_error : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* must always have async data allocated */
	unsigned		needs_async_data : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
	/* mask of IO_WQ_WORK_* flags, e.g. IO_WQ_WORK_MM */
	unsigned		work_flags;
};
764 
/*
 * Per-opcode property table, indexed by IORING_OP_* (callers look it up as
 * io_op_defs[req->opcode]). Entries with an empty initializer have every
 * property zero. async_size, where set, is the sizeof() of the matching
 * io_async_* / io_timeout_data structure.
 */
static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_rw),
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_WRITEV] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_rw),
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
						IO_WQ_WORK_FSIZE,
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
		.work_flags		= IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.async_size		= sizeof(struct io_async_rw),
		.work_flags		= IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.async_size		= sizeof(struct io_async_rw),
		.work_flags		= IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
						IO_WQ_WORK_MM,
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_POLL_REMOVE] = {},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
		.work_flags		= IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
						IO_WQ_WORK_FS,
	},
	[IORING_OP_RECVMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
						IO_WQ_WORK_FS,
	},
	[IORING_OP_TIMEOUT] = {
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_timeout_data),
		.work_flags		= IO_WQ_WORK_MM,
	},
	[IORING_OP_TIMEOUT_REMOVE] = {},
	[IORING_OP_ACCEPT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
	},
	[IORING_OP_ASYNC_CANCEL] = {},
	[IORING_OP_LINK_TIMEOUT] = {
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_timeout_data),
		.work_flags		= IO_WQ_WORK_MM,
	},
	[IORING_OP_CONNECT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_connect),
		.work_flags		= IO_WQ_WORK_MM,
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
		.work_flags		= IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
	},
	[IORING_OP_OPENAT] = {
		.work_flags		= IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
						IO_WQ_WORK_FS,
	},
	[IORING_OP_CLOSE] = {
		.needs_file		= 1,
		.needs_file_no_error	= 1,
		.work_flags		= IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_FILES_UPDATE] = {
		.work_flags		= IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
	},
	[IORING_OP_STATX] = {
		.work_flags		= IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
						IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_READ] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.async_size		= sizeof(struct io_async_rw),
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_WRITE] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.async_size		= sizeof(struct io_async_rw),
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
						IO_WQ_WORK_FSIZE,
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
		.work_flags		= IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_MADVISE] = {
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_SEND] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_RECV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_OPENAT2] = {
		.work_flags		= IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
						IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
		.work_flags		= IO_WQ_WORK_FILES,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.work_flags		= IO_WQ_WORK_BLKCG,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {},
	[IORING_OP_REMOVE_BUFFERS] = {},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
};
938 
/*
 * Selects the kind of memory accounting to apply.
 * NOTE(review): the users are outside the visible part of the file; the
 * names suggest "locked" vs. "pinned" page accounting - confirm there.
 */
enum io_mem_account {
	ACCT_LOCKED,
	ACCT_PINNED,
};
943 
/*
 * Forward declarations of file-local (static) helpers that are referenced
 * before their definitions.
 */
static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
			     struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_files_update *ip,
				 unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
				struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
static void io_file_put_work(struct work_struct *work);

static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
			       struct iovec **iovec, struct iov_iter *iter,
			       bool needs_lock);
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     const struct iovec *fast_iov,
			     struct iov_iter *iter, bool force);
968 
/* slab cache for requests; io_ring_ctx_alloc() takes ctx->fallback_req from it */
static struct kmem_cache *req_cachep;

/*
 * Declared here so that io_uring_get_socket() can compare file->f_op
 * against it to recognise io_uring files.
 */
static const struct file_operations io_uring_fops;
972 
io_uring_get_socket(struct file * file)973 struct sock *io_uring_get_socket(struct file *file)
974 {
975 #if defined(CONFIG_UNIX)
976 	if (file->f_op == &io_uring_fops) {
977 		struct io_ring_ctx *ctx = file->private_data;
978 
979 		return ctx->ring_sock->sk;
980 	}
981 #endif
982 	return NULL;
983 }
984 EXPORT_SYMBOL(io_uring_get_socket);
985 
io_clean_op(struct io_kiocb * req)986 static inline void io_clean_op(struct io_kiocb *req)
987 {
988 	if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
989 			  REQ_F_INFLIGHT))
990 		__io_clean_op(req);
991 }
992 
io_sq_thread_drop_mm(void)993 static void io_sq_thread_drop_mm(void)
994 {
995 	struct mm_struct *mm = current->mm;
996 
997 	if (mm) {
998 		kthread_unuse_mm(mm);
999 		mmput(mm);
1000 		current->mm = NULL;
1001 	}
1002 }
1003 
__io_sq_thread_acquire_mm(struct io_ring_ctx * ctx)1004 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
1005 {
1006 	struct mm_struct *mm;
1007 
1008 	if (current->mm)
1009 		return 0;
1010 
1011 	/* Should never happen */
1012 	if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
1013 		return -EFAULT;
1014 
1015 	task_lock(ctx->sqo_task);
1016 	mm = ctx->sqo_task->mm;
1017 	if (unlikely(!mm || !mmget_not_zero(mm)))
1018 		mm = NULL;
1019 	task_unlock(ctx->sqo_task);
1020 
1021 	if (mm) {
1022 		kthread_use_mm(mm);
1023 		return 0;
1024 	}
1025 
1026 	return -EFAULT;
1027 }
1028 
io_sq_thread_acquire_mm(struct io_ring_ctx * ctx,struct io_kiocb * req)1029 static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
1030 				   struct io_kiocb *req)
1031 {
1032 	if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
1033 		return 0;
1034 	return __io_sq_thread_acquire_mm(ctx);
1035 }
1036 
/*
 * Switch the current kthread to the blkcg css recorded in @ctx, unless
 * *@cur_css already equals it. On a switch, *@cur_css is updated to the new
 * css. Compiles to nothing without CONFIG_BLK_CGROUP.
 */
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
					 struct cgroup_subsys_state **cur_css)
{
#ifdef CONFIG_BLK_CGROUP
	struct cgroup_subsys_state *wanted = ctx->sqo_blkcg_css;

	if (*cur_css == wanted)
		return;

	/* puts the old one when swapping */
	kthread_associate_blkcg(wanted);
	*cur_css = wanted;
#endif
}
1049 
/*
 * Drop the current kthread's blkcg association by associating it with
 * NULL. Compiles to nothing without CONFIG_BLK_CGROUP.
 */
static void io_sq_thread_unassociate_blkcg(void)
{
#if defined(CONFIG_BLK_CGROUP)
	kthread_associate_blkcg(NULL);
#endif
}
1056 
req_set_fail_links(struct io_kiocb * req)1057 static inline void req_set_fail_links(struct io_kiocb *req)
1058 {
1059 	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1060 		req->flags |= REQ_F_FAIL_LINK;
1061 }
1062 
1063 /*
1064  * None of these are dereferenced, they are simply used to check if any of
1065  * them have changed. If we're under current and check they are still the
1066  * same, we're fine to grab references to them for actual out-of-line use.
1067  */
io_init_identity(struct io_identity * id)1068 static void io_init_identity(struct io_identity *id)
1069 {
1070 	id->files = current->files;
1071 	id->mm = current->mm;
1072 #ifdef CONFIG_BLK_CGROUP
1073 	rcu_read_lock();
1074 	id->blkcg_css = blkcg_css();
1075 	rcu_read_unlock();
1076 #endif
1077 	id->creds = current_cred();
1078 	id->nsproxy = current->nsproxy;
1079 	id->fs = current->fs;
1080 	id->fsize = rlimit(RLIMIT_FSIZE);
1081 #ifdef CONFIG_AUDIT
1082 	id->loginuid = current->loginuid;
1083 	id->sessionid = current->sessionid;
1084 #endif
1085 	refcount_set(&id->count, 1);
1086 }
1087 
__io_req_init_async(struct io_kiocb * req)1088 static inline void __io_req_init_async(struct io_kiocb *req)
1089 {
1090 	memset(&req->work, 0, sizeof(req->work));
1091 	req->flags |= REQ_F_WORK_INITIALIZED;
1092 }
1093 
1094 /*
1095  * Note: must call io_req_init_async() for the first time you
1096  * touch any members of io_wq_work.
1097  */
io_req_init_async(struct io_kiocb * req)1098 static inline void io_req_init_async(struct io_kiocb *req)
1099 {
1100 	struct io_uring_task *tctx = current->io_uring;
1101 
1102 	if (req->flags & REQ_F_WORK_INITIALIZED)
1103 		return;
1104 
1105 	__io_req_init_async(req);
1106 
1107 	/* Grab a ref if this isn't our static identity */
1108 	req->work.identity = tctx->identity;
1109 	if (tctx->identity != &tctx->__identity)
1110 		refcount_inc(&req->work.identity->count);
1111 }
1112 
io_async_submit(struct io_ring_ctx * ctx)1113 static inline bool io_async_submit(struct io_ring_ctx *ctx)
1114 {
1115 	return ctx->flags & IORING_SETUP_SQPOLL;
1116 }
1117 
io_ring_ctx_ref_free(struct percpu_ref * ref)1118 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1119 {
1120 	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1121 
1122 	complete(&ctx->ref_comp);
1123 }
1124 
io_is_timeout_noseq(struct io_kiocb * req)1125 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1126 {
1127 	return !req->timeout.off;
1128 }
1129 
io_ring_ctx_alloc(struct io_uring_params * p)1130 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1131 {
1132 	struct io_ring_ctx *ctx;
1133 	int hash_bits;
1134 
1135 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1136 	if (!ctx)
1137 		return NULL;
1138 
1139 	ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
1140 	if (!ctx->fallback_req)
1141 		goto err;
1142 
1143 	/*
1144 	 * Use 5 bits less than the max cq entries, that should give us around
1145 	 * 32 entries per hash list if totally full and uniformly spread.
1146 	 */
1147 	hash_bits = ilog2(p->cq_entries);
1148 	hash_bits -= 5;
1149 	if (hash_bits <= 0)
1150 		hash_bits = 1;
1151 	ctx->cancel_hash_bits = hash_bits;
1152 	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1153 					GFP_KERNEL);
1154 	if (!ctx->cancel_hash)
1155 		goto err;
1156 	__hash_init(ctx->cancel_hash, 1U << hash_bits);
1157 
1158 	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1159 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1160 		goto err;
1161 
1162 	ctx->flags = p->flags;
1163 	init_waitqueue_head(&ctx->sqo_sq_wait);
1164 	INIT_LIST_HEAD(&ctx->sqd_list);
1165 	init_waitqueue_head(&ctx->cq_wait);
1166 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
1167 	init_completion(&ctx->ref_comp);
1168 	init_completion(&ctx->sq_thread_comp);
1169 	idr_init(&ctx->io_buffer_idr);
1170 	idr_init(&ctx->personality_idr);
1171 	mutex_init(&ctx->uring_lock);
1172 	init_waitqueue_head(&ctx->wait);
1173 	spin_lock_init(&ctx->completion_lock);
1174 	INIT_LIST_HEAD(&ctx->iopoll_list);
1175 	INIT_LIST_HEAD(&ctx->defer_list);
1176 	INIT_LIST_HEAD(&ctx->timeout_list);
1177 	init_waitqueue_head(&ctx->inflight_wait);
1178 	spin_lock_init(&ctx->inflight_lock);
1179 	INIT_LIST_HEAD(&ctx->inflight_list);
1180 	INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
1181 	init_llist_head(&ctx->file_put_llist);
1182 	return ctx;
1183 err:
1184 	if (ctx->fallback_req)
1185 		kmem_cache_free(req_cachep, ctx->fallback_req);
1186 	kfree(ctx->cancel_hash);
1187 	kfree(ctx);
1188 	return NULL;
1189 }
1190 
req_need_defer(struct io_kiocb * req,u32 seq)1191 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1192 {
1193 	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1194 		struct io_ring_ctx *ctx = req->ctx;
1195 
1196 		return seq != ctx->cached_cq_tail
1197 				+ READ_ONCE(ctx->cached_cq_overflow);
1198 	}
1199 
1200 	return false;
1201 }
1202 
__io_commit_cqring(struct io_ring_ctx * ctx)1203 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1204 {
1205 	struct io_rings *rings = ctx->rings;
1206 
1207 	/* order cqe stores with ring update */
1208 	smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1209 
1210 	if (wq_has_sleeper(&ctx->cq_wait)) {
1211 		wake_up_interruptible(&ctx->cq_wait);
1212 		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1213 	}
1214 }
1215 
io_put_identity(struct io_uring_task * tctx,struct io_kiocb * req)1216 static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
1217 {
1218 	if (req->work.identity == &tctx->__identity)
1219 		return;
1220 	if (refcount_dec_and_test(&req->work.identity->count))
1221 		kfree(req->work.identity);
1222 }
1223 
io_req_clean_work(struct io_kiocb * req)1224 static void io_req_clean_work(struct io_kiocb *req)
1225 {
1226 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
1227 		return;
1228 
1229 	req->flags &= ~REQ_F_WORK_INITIALIZED;
1230 
1231 	if (req->work.flags & IO_WQ_WORK_MM) {
1232 		mmdrop(req->work.identity->mm);
1233 		req->work.flags &= ~IO_WQ_WORK_MM;
1234 	}
1235 #ifdef CONFIG_BLK_CGROUP
1236 	if (req->work.flags & IO_WQ_WORK_BLKCG) {
1237 		css_put(req->work.identity->blkcg_css);
1238 		req->work.flags &= ~IO_WQ_WORK_BLKCG;
1239 	}
1240 #endif
1241 	if (req->work.flags & IO_WQ_WORK_CREDS) {
1242 		put_cred(req->work.identity->creds);
1243 		req->work.flags &= ~IO_WQ_WORK_CREDS;
1244 	}
1245 	if (req->work.flags & IO_WQ_WORK_FS) {
1246 		struct fs_struct *fs = req->work.identity->fs;
1247 
1248 		spin_lock(&req->work.identity->fs->lock);
1249 		if (--fs->users)
1250 			fs = NULL;
1251 		spin_unlock(&req->work.identity->fs->lock);
1252 		if (fs)
1253 			free_fs_struct(fs);
1254 		req->work.flags &= ~IO_WQ_WORK_FS;
1255 	}
1256 
1257 	io_put_identity(req->task->io_uring, req);
1258 }
1259 
1260 /*
1261  * Create a private copy of io_identity, since some fields don't match
1262  * the current context.
1263  */
io_identity_cow(struct io_kiocb * req)1264 static bool io_identity_cow(struct io_kiocb *req)
1265 {
1266 	struct io_uring_task *tctx = current->io_uring;
1267 	const struct cred *creds = NULL;
1268 	struct io_identity *id;
1269 
1270 	if (req->work.flags & IO_WQ_WORK_CREDS)
1271 		creds = req->work.identity->creds;
1272 
1273 	id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
1274 	if (unlikely(!id)) {
1275 		req->work.flags |= IO_WQ_WORK_CANCEL;
1276 		return false;
1277 	}
1278 
1279 	/*
1280 	 * We can safely just re-init the creds we copied  Either the field
1281 	 * matches the current one, or we haven't grabbed it yet. The only
1282 	 * exception is ->creds, through registered personalities, so handle
1283 	 * that one separately.
1284 	 */
1285 	io_init_identity(id);
1286 	if (creds)
1287 		id->creds = creds;
1288 
1289 	/* add one for this request */
1290 	refcount_inc(&id->count);
1291 
1292 	/* drop tctx and req identity references, if needed */
1293 	if (tctx->identity != &tctx->__identity &&
1294 	    refcount_dec_and_test(&tctx->identity->count))
1295 		kfree(tctx->identity);
1296 	if (req->work.identity != &tctx->__identity &&
1297 	    refcount_dec_and_test(&req->work.identity->count))
1298 		kfree(req->work.identity);
1299 
1300 	req->work.identity = id;
1301 	tctx->identity = id;
1302 	return true;
1303 }
1304 
io_grab_identity(struct io_kiocb * req)1305 static bool io_grab_identity(struct io_kiocb *req)
1306 {
1307 	const struct io_op_def *def = &io_op_defs[req->opcode];
1308 	struct io_identity *id = req->work.identity;
1309 	struct io_ring_ctx *ctx = req->ctx;
1310 
1311 	if (def->work_flags & IO_WQ_WORK_FSIZE) {
1312 		if (id->fsize != rlimit(RLIMIT_FSIZE))
1313 			return false;
1314 		req->work.flags |= IO_WQ_WORK_FSIZE;
1315 	}
1316 #ifdef CONFIG_BLK_CGROUP
1317 	if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
1318 	    (def->work_flags & IO_WQ_WORK_BLKCG)) {
1319 		rcu_read_lock();
1320 		if (id->blkcg_css != blkcg_css()) {
1321 			rcu_read_unlock();
1322 			return false;
1323 		}
1324 		/*
1325 		 * This should be rare, either the cgroup is dying or the task
1326 		 * is moving cgroups. Just punt to root for the handful of ios.
1327 		 */
1328 		if (css_tryget_online(id->blkcg_css))
1329 			req->work.flags |= IO_WQ_WORK_BLKCG;
1330 		rcu_read_unlock();
1331 	}
1332 #endif
1333 	if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
1334 		if (id->creds != current_cred())
1335 			return false;
1336 		get_cred(id->creds);
1337 		req->work.flags |= IO_WQ_WORK_CREDS;
1338 	}
1339 #ifdef CONFIG_AUDIT
1340 	if (!uid_eq(current->loginuid, id->loginuid) ||
1341 	    current->sessionid != id->sessionid)
1342 		return false;
1343 #endif
1344 	if (!(req->work.flags & IO_WQ_WORK_FS) &&
1345 	    (def->work_flags & IO_WQ_WORK_FS)) {
1346 		if (current->fs != id->fs)
1347 			return false;
1348 		spin_lock(&id->fs->lock);
1349 		if (!id->fs->in_exec) {
1350 			id->fs->users++;
1351 			req->work.flags |= IO_WQ_WORK_FS;
1352 		} else {
1353 			req->work.flags |= IO_WQ_WORK_CANCEL;
1354 		}
1355 		spin_unlock(¤t->fs->lock);
1356 	}
1357 	if (!(req->work.flags & IO_WQ_WORK_FILES) &&
1358 	    (def->work_flags & IO_WQ_WORK_FILES) &&
1359 	    !(req->flags & REQ_F_NO_FILE_TABLE)) {
1360 		if (id->files != current->files ||
1361 		    id->nsproxy != current->nsproxy)
1362 			return false;
1363 		atomic_inc(&id->files->count);
1364 		get_nsproxy(id->nsproxy);
1365 		req->flags |= REQ_F_INFLIGHT;
1366 
1367 		spin_lock_irq(&ctx->inflight_lock);
1368 		list_add(&req->inflight_entry, &ctx->inflight_list);
1369 		spin_unlock_irq(&ctx->inflight_lock);
1370 		req->work.flags |= IO_WQ_WORK_FILES;
1371 	}
1372 
1373 	return true;
1374 }
1375 
io_prep_async_work(struct io_kiocb * req)1376 static void io_prep_async_work(struct io_kiocb *req)
1377 {
1378 	const struct io_op_def *def = &io_op_defs[req->opcode];
1379 	struct io_ring_ctx *ctx = req->ctx;
1380 	struct io_identity *id;
1381 
1382 	io_req_init_async(req);
1383 	id = req->work.identity;
1384 
1385 	if (req->flags & REQ_F_FORCE_ASYNC)
1386 		req->work.flags |= IO_WQ_WORK_CONCURRENT;
1387 
1388 	if (req->flags & REQ_F_ISREG) {
1389 		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1390 			io_wq_hash_work(&req->work, file_inode(req->file));
1391 	} else {
1392 		if (def->unbound_nonreg_file)
1393 			req->work.flags |= IO_WQ_WORK_UNBOUND;
1394 	}
1395 
1396 	/* ->mm can never change on us */
1397 	if (!(req->work.flags & IO_WQ_WORK_MM) &&
1398 	    (def->work_flags & IO_WQ_WORK_MM)) {
1399 		mmgrab(id->mm);
1400 		req->work.flags |= IO_WQ_WORK_MM;
1401 	}
1402 
1403 	/* if we fail grabbing identity, we must COW, regrab, and retry */
1404 	if (io_grab_identity(req))
1405 		return;
1406 
1407 	if (!io_identity_cow(req))
1408 		return;
1409 
1410 	/* can't fail at this point */
1411 	if (!io_grab_identity(req))
1412 		WARN_ON(1);
1413 }
1414 
io_prep_async_link(struct io_kiocb * req)1415 static void io_prep_async_link(struct io_kiocb *req)
1416 {
1417 	struct io_kiocb *cur;
1418 
1419 	io_prep_async_work(req);
1420 	if (req->flags & REQ_F_LINK_HEAD)
1421 		list_for_each_entry(cur, &req->link_list, link_list)
1422 			io_prep_async_work(cur);
1423 }
1424 
__io_queue_async_work(struct io_kiocb * req)1425 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
1426 {
1427 	struct io_ring_ctx *ctx = req->ctx;
1428 	struct io_kiocb *link = io_prep_linked_timeout(req);
1429 
1430 	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1431 					&req->work, req->flags);
1432 	io_wq_enqueue(ctx->io_wq, &req->work);
1433 	return link;
1434 }
1435 
/*
 * Punt @req to io-wq: prepare the whole link, enqueue the request and, if
 * it has a linked timeout, queue that timeout too.
 */
static void io_queue_async_work(struct io_kiocb *req)
{
	struct io_kiocb *timeout;

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	timeout = __io_queue_async_work(req);
	if (!timeout)
		return;

	io_queue_linked_timeout(timeout);
}
1447 
io_kill_timeout(struct io_kiocb * req)1448 static void io_kill_timeout(struct io_kiocb *req)
1449 {
1450 	struct io_timeout_data *io = req->async_data;
1451 	int ret;
1452 
1453 	ret = hrtimer_try_to_cancel(&io->timer);
1454 	if (ret != -1) {
1455 		atomic_set(&req->ctx->cq_timeouts,
1456 			atomic_read(&req->ctx->cq_timeouts) + 1);
1457 		list_del_init(&req->timeout.list);
1458 		io_cqring_fill_event(req, 0);
1459 		io_put_req_deferred(req, 1);
1460 	}
1461 }
1462 
io_task_match(struct io_kiocb * req,struct task_struct * tsk)1463 static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
1464 {
1465 	struct io_ring_ctx *ctx = req->ctx;
1466 
1467 	if (!tsk || req->task == tsk)
1468 		return true;
1469 	if (ctx->flags & IORING_SETUP_SQPOLL) {
1470 		if (ctx->sq_data && req->task == ctx->sq_data->thread)
1471 			return true;
1472 	}
1473 	return false;
1474 }
1475 
1476 /*
1477  * Returns true if we found and killed one or more timeouts
1478  */
io_kill_timeouts(struct io_ring_ctx * ctx,struct task_struct * tsk)1479 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
1480 {
1481 	struct io_kiocb *req, *tmp;
1482 	int canceled = 0;
1483 
1484 	spin_lock_irq(&ctx->completion_lock);
1485 	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1486 		if (io_task_match(req, tsk)) {
1487 			io_kill_timeout(req);
1488 			canceled++;
1489 		}
1490 	}
1491 	spin_unlock_irq(&ctx->completion_lock);
1492 	return canceled != 0;
1493 }
1494 
__io_queue_deferred(struct io_ring_ctx * ctx)1495 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1496 {
1497 	do {
1498 		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1499 						struct io_defer_entry, list);
1500 		struct io_kiocb *link;
1501 
1502 		if (req_need_defer(de->req, de->seq))
1503 			break;
1504 		list_del_init(&de->list);
1505 		/* punt-init is done before queueing for defer */
1506 		link = __io_queue_async_work(de->req);
1507 		if (link) {
1508 			__io_queue_linked_timeout(link);
1509 			/* drop submission reference */
1510 			io_put_req_deferred(link, 1);
1511 		}
1512 		kfree(de);
1513 	} while (!list_empty(&ctx->defer_list));
1514 }
1515 
io_flush_timeouts(struct io_ring_ctx * ctx)1516 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1517 {
1518 	while (!list_empty(&ctx->timeout_list)) {
1519 		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1520 						struct io_kiocb, timeout.list);
1521 
1522 		if (io_is_timeout_noseq(req))
1523 			break;
1524 		if (req->timeout.target_seq != ctx->cached_cq_tail
1525 					- atomic_read(&ctx->cq_timeouts))
1526 			break;
1527 
1528 		list_del_init(&req->timeout.list);
1529 		io_kill_timeout(req);
1530 	}
1531 }
1532 
io_commit_cqring(struct io_ring_ctx * ctx)1533 static void io_commit_cqring(struct io_ring_ctx *ctx)
1534 {
1535 	io_flush_timeouts(ctx);
1536 	__io_commit_cqring(ctx);
1537 
1538 	if (unlikely(!list_empty(&ctx->defer_list)))
1539 		__io_queue_deferred(ctx);
1540 }
1541 
io_sqring_full(struct io_ring_ctx * ctx)1542 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1543 {
1544 	struct io_rings *r = ctx->rings;
1545 
1546 	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1547 }
1548 
io_get_cqring(struct io_ring_ctx * ctx)1549 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1550 {
1551 	struct io_rings *rings = ctx->rings;
1552 	unsigned tail;
1553 
1554 	tail = ctx->cached_cq_tail;
1555 	/*
1556 	 * writes to the cq entry need to come after reading head; the
1557 	 * control dependency is enough as we're using WRITE_ONCE to
1558 	 * fill the cq entry
1559 	 */
1560 	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1561 		return NULL;
1562 
1563 	ctx->cached_cq_tail++;
1564 	return &rings->cqes[tail & ctx->cq_mask];
1565 }
1566 
io_should_trigger_evfd(struct io_ring_ctx * ctx)1567 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1568 {
1569 	if (!ctx->cq_ev_fd)
1570 		return false;
1571 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1572 		return false;
1573 	if (!ctx->eventfd_async)
1574 		return true;
1575 	return io_wq_current_is_worker();
1576 }
1577 
io_cqring_ev_posted(struct io_ring_ctx * ctx)1578 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1579 {
1580 	if (waitqueue_active(&ctx->wait))
1581 		wake_up(&ctx->wait);
1582 	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1583 		wake_up(&ctx->sq_data->wait);
1584 	if (io_should_trigger_evfd(ctx))
1585 		eventfd_signal(ctx->cq_ev_fd, 1);
1586 }
1587 
io_cqring_mark_overflow(struct io_ring_ctx * ctx)1588 static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
1589 {
1590 	if (list_empty(&ctx->cq_overflow_list)) {
1591 		clear_bit(0, &ctx->sq_check_overflow);
1592 		clear_bit(0, &ctx->cq_check_overflow);
1593 		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1594 	}
1595 }
1596 
__io_match_files(struct io_kiocb * req,struct files_struct * files)1597 static inline bool __io_match_files(struct io_kiocb *req,
1598 				    struct files_struct *files)
1599 {
1600 	return ((req->flags & REQ_F_WORK_INITIALIZED) &&
1601 	        (req->work.flags & IO_WQ_WORK_FILES)) &&
1602 		req->work.identity->files == files;
1603 }
1604 
io_match_files(struct io_kiocb * req,struct files_struct * files)1605 static bool io_match_files(struct io_kiocb *req,
1606 			   struct files_struct *files)
1607 {
1608 	struct io_kiocb *link;
1609 
1610 	if (!files)
1611 		return true;
1612 	if (__io_match_files(req, files))
1613 		return true;
1614 	if (req->flags & REQ_F_LINK_HEAD) {
1615 		list_for_each_entry(link, &req->link_list, link_list) {
1616 			if (__io_match_files(link, files))
1617 				return true;
1618 		}
1619 	}
1620 	return false;
1621 }
1622 
1623 /* Returns true if there are no backlogged entries after the flush */
io_cqring_overflow_flush(struct io_ring_ctx * ctx,bool force,struct task_struct * tsk,struct files_struct * files)1624 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1625 				     struct task_struct *tsk,
1626 				     struct files_struct *files)
1627 {
1628 	struct io_rings *rings = ctx->rings;
1629 	struct io_kiocb *req, *tmp;
1630 	struct io_uring_cqe *cqe;
1631 	unsigned long flags;
1632 	LIST_HEAD(list);
1633 
1634 	if (!force) {
1635 		if (list_empty_careful(&ctx->cq_overflow_list))
1636 			return true;
1637 		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1638 		    rings->cq_ring_entries))
1639 			return false;
1640 	}
1641 
1642 	spin_lock_irqsave(&ctx->completion_lock, flags);
1643 
1644 	/* if force is set, the ring is going away. always drop after that */
1645 	if (force)
1646 		ctx->cq_overflow_flushed = 1;
1647 
1648 	cqe = NULL;
1649 	list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
1650 		if (tsk && req->task != tsk)
1651 			continue;
1652 		if (!io_match_files(req, files))
1653 			continue;
1654 
1655 		cqe = io_get_cqring(ctx);
1656 		if (!cqe && !force)
1657 			break;
1658 
1659 		list_move(&req->compl.list, &list);
1660 		if (cqe) {
1661 			WRITE_ONCE(cqe->user_data, req->user_data);
1662 			WRITE_ONCE(cqe->res, req->result);
1663 			WRITE_ONCE(cqe->flags, req->compl.cflags);
1664 		} else {
1665 			ctx->cached_cq_overflow++;
1666 			WRITE_ONCE(ctx->rings->cq_overflow,
1667 				   ctx->cached_cq_overflow);
1668 		}
1669 	}
1670 
1671 	io_commit_cqring(ctx);
1672 	io_cqring_mark_overflow(ctx);
1673 
1674 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
1675 	io_cqring_ev_posted(ctx);
1676 
1677 	while (!list_empty(&list)) {
1678 		req = list_first_entry(&list, struct io_kiocb, compl.list);
1679 		list_del(&req->compl.list);
1680 		io_put_req(req);
1681 	}
1682 
1683 	return cqe != NULL;
1684 }
1685 
__io_cqring_fill_event(struct io_kiocb * req,long res,long cflags)1686 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1687 {
1688 	struct io_ring_ctx *ctx = req->ctx;
1689 	struct io_uring_cqe *cqe;
1690 
1691 	trace_io_uring_complete(ctx, req->user_data, res);
1692 
1693 	/*
1694 	 * If we can't get a cq entry, userspace overflowed the
1695 	 * submission (by quite a lot). Increment the overflow count in
1696 	 * the ring.
1697 	 */
1698 	cqe = io_get_cqring(ctx);
1699 	if (likely(cqe)) {
1700 		WRITE_ONCE(cqe->user_data, req->user_data);
1701 		WRITE_ONCE(cqe->res, res);
1702 		WRITE_ONCE(cqe->flags, cflags);
1703 	} else if (ctx->cq_overflow_flushed ||
1704 		   atomic_read(&req->task->io_uring->in_idle)) {
1705 		/*
1706 		 * If we're in ring overflow flush mode, or in task cancel mode,
1707 		 * then we cannot store the request for later flushing, we need
1708 		 * to drop it on the floor.
1709 		 */
1710 		ctx->cached_cq_overflow++;
1711 		WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
1712 	} else {
1713 		if (list_empty(&ctx->cq_overflow_list)) {
1714 			set_bit(0, &ctx->sq_check_overflow);
1715 			set_bit(0, &ctx->cq_check_overflow);
1716 			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1717 		}
1718 		io_clean_op(req);
1719 		req->result = res;
1720 		req->compl.cflags = cflags;
1721 		refcount_inc(&req->refs);
1722 		list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
1723 	}
1724 }
1725 
/* __io_cqring_fill_event() with no CQE flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
	const long no_cflags = 0;

	__io_cqring_fill_event(req, res, no_cflags);
}
1730 
io_cqring_add_event(struct io_kiocb * req,long res,long cflags)1731 static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1732 {
1733 	struct io_ring_ctx *ctx = req->ctx;
1734 	unsigned long flags;
1735 
1736 	spin_lock_irqsave(&ctx->completion_lock, flags);
1737 	__io_cqring_fill_event(req, res, cflags);
1738 	io_commit_cqring(ctx);
1739 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
1740 
1741 	io_cqring_ev_posted(ctx);
1742 }
1743 
io_submit_flush_completions(struct io_comp_state * cs)1744 static void io_submit_flush_completions(struct io_comp_state *cs)
1745 {
1746 	struct io_ring_ctx *ctx = cs->ctx;
1747 
1748 	spin_lock_irq(&ctx->completion_lock);
1749 	while (!list_empty(&cs->list)) {
1750 		struct io_kiocb *req;
1751 
1752 		req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
1753 		list_del(&req->compl.list);
1754 		__io_cqring_fill_event(req, req->result, req->compl.cflags);
1755 
1756 		/*
1757 		 * io_free_req() doesn't care about completion_lock unless one
1758 		 * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
1759 		 * because of a potential deadlock with req->work.fs->lock
1760 		 */
1761 		if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
1762 				 |REQ_F_WORK_INITIALIZED)) {
1763 			spin_unlock_irq(&ctx->completion_lock);
1764 			io_put_req(req);
1765 			spin_lock_irq(&ctx->completion_lock);
1766 		} else {
1767 			io_put_req(req);
1768 		}
1769 	}
1770 	io_commit_cqring(ctx);
1771 	spin_unlock_irq(&ctx->completion_lock);
1772 
1773 	io_cqring_ev_posted(ctx);
1774 	cs->nr = 0;
1775 }
1776 
__io_req_complete(struct io_kiocb * req,long res,unsigned cflags,struct io_comp_state * cs)1777 static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
1778 			      struct io_comp_state *cs)
1779 {
1780 	if (!cs) {
1781 		io_cqring_add_event(req, res, cflags);
1782 		io_put_req(req);
1783 	} else {
1784 		io_clean_op(req);
1785 		req->result = res;
1786 		req->compl.cflags = cflags;
1787 		list_add_tail(&req->compl.list, &cs->list);
1788 		if (++cs->nr >= 32)
1789 			io_submit_flush_completions(cs);
1790 	}
1791 }
1792 
io_req_complete(struct io_kiocb * req,long res)1793 static void io_req_complete(struct io_kiocb *req, long res)
1794 {
1795 	__io_req_complete(req, res, 0, NULL);
1796 }
1797 
io_is_fallback_req(struct io_kiocb * req)1798 static inline bool io_is_fallback_req(struct io_kiocb *req)
1799 {
1800 	return req == (struct io_kiocb *)
1801 			((unsigned long) req->ctx->fallback_req & ~1UL);
1802 }
1803 
io_get_fallback_req(struct io_ring_ctx * ctx)1804 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1805 {
1806 	struct io_kiocb *req;
1807 
1808 	req = ctx->fallback_req;
1809 	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1810 		return req;
1811 
1812 	return NULL;
1813 }
1814 
io_alloc_req(struct io_ring_ctx * ctx,struct io_submit_state * state)1815 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1816 				     struct io_submit_state *state)
1817 {
1818 	if (!state->free_reqs) {
1819 		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1820 		size_t sz;
1821 		int ret;
1822 
1823 		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1824 		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1825 
1826 		/*
1827 		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1828 		 * retry single alloc to be on the safe side.
1829 		 */
1830 		if (unlikely(ret <= 0)) {
1831 			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1832 			if (!state->reqs[0])
1833 				goto fallback;
1834 			ret = 1;
1835 		}
1836 		state->free_reqs = ret;
1837 	}
1838 
1839 	state->free_reqs--;
1840 	return state->reqs[state->free_reqs];
1841 fallback:
1842 	return io_get_fallback_req(ctx);
1843 }
1844 
/*
 * Release @req's hold on @file: a fixed file puts the request's
 * fixed_file_refs percpu ref, a normal file is fput().
 */
static inline void io_put_file(struct io_kiocb *req, struct file *file,
			  bool fixed)
{
	if (!fixed) {
		fput(file);
		return;
	}
	percpu_ref_put(req->fixed_file_refs);
}
1853 
io_dismantle_req(struct io_kiocb * req)1854 static void io_dismantle_req(struct io_kiocb *req)
1855 {
1856 	io_clean_op(req);
1857 
1858 	if (req->async_data)
1859 		kfree(req->async_data);
1860 	if (req->file)
1861 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1862 
1863 	io_req_clean_work(req);
1864 }
1865 
__io_free_req(struct io_kiocb * req)1866 static void __io_free_req(struct io_kiocb *req)
1867 {
1868 	struct io_uring_task *tctx = req->task->io_uring;
1869 	struct io_ring_ctx *ctx = req->ctx;
1870 
1871 	io_dismantle_req(req);
1872 
1873 	percpu_counter_dec(&tctx->inflight);
1874 	if (atomic_read(&tctx->in_idle))
1875 		wake_up(&tctx->wait);
1876 	put_task_struct(req->task);
1877 
1878 	if (likely(!io_is_fallback_req(req)))
1879 		kmem_cache_free(req_cachep, req);
1880 	else
1881 		clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
1882 	percpu_ref_put(&ctx->refs);
1883 }
1884 
io_kill_linked_timeout(struct io_kiocb * req)1885 static void io_kill_linked_timeout(struct io_kiocb *req)
1886 {
1887 	struct io_ring_ctx *ctx = req->ctx;
1888 	struct io_kiocb *link;
1889 	bool cancelled = false;
1890 	unsigned long flags;
1891 
1892 	spin_lock_irqsave(&ctx->completion_lock, flags);
1893 	link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
1894 					link_list);
1895 	/*
1896 	 * Can happen if a linked timeout fired and link had been like
1897 	 * req -> link t-out -> link t-out [-> ...]
1898 	 */
1899 	if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1900 		struct io_timeout_data *io = link->async_data;
1901 		int ret;
1902 
1903 		list_del_init(&link->link_list);
1904 		ret = hrtimer_try_to_cancel(&io->timer);
1905 		if (ret != -1) {
1906 			io_cqring_fill_event(link, -ECANCELED);
1907 			io_commit_cqring(ctx);
1908 			cancelled = true;
1909 		}
1910 	}
1911 	req->flags &= ~REQ_F_LINK_TIMEOUT;
1912 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
1913 
1914 	if (cancelled) {
1915 		io_cqring_ev_posted(ctx);
1916 		io_put_req(link);
1917 	}
1918 }
1919 
io_req_link_next(struct io_kiocb * req)1920 static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
1921 {
1922 	struct io_kiocb *nxt;
1923 
1924 	/*
1925 	 * The list should never be empty when we are called here. But could
1926 	 * potentially happen if the chain is messed up, check to be on the
1927 	 * safe side.
1928 	 */
1929 	if (unlikely(list_empty(&req->link_list)))
1930 		return NULL;
1931 
1932 	nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1933 	list_del_init(&req->link_list);
1934 	if (!list_empty(&nxt->link_list))
1935 		nxt->flags |= REQ_F_LINK_HEAD;
1936 	return nxt;
1937 }
1938 
1939 /*
1940  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1941  */
io_fail_links(struct io_kiocb * req)1942 static void io_fail_links(struct io_kiocb *req)
1943 {
1944 	struct io_ring_ctx *ctx = req->ctx;
1945 	unsigned long flags;
1946 
1947 	spin_lock_irqsave(&ctx->completion_lock, flags);
1948 	while (!list_empty(&req->link_list)) {
1949 		struct io_kiocb *link = list_first_entry(&req->link_list,
1950 						struct io_kiocb, link_list);
1951 
1952 		list_del_init(&link->link_list);
1953 		trace_io_uring_fail_link(req, link);
1954 
1955 		io_cqring_fill_event(link, -ECANCELED);
1956 
1957 		/*
1958 		 * It's ok to free under spinlock as they're not linked anymore,
1959 		 * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
1960 		 * work.fs->lock.
1961 		 */
1962 		if (link->flags & REQ_F_WORK_INITIALIZED)
1963 			io_put_req_deferred(link, 2);
1964 		else
1965 			io_double_put_req(link);
1966 	}
1967 
1968 	io_commit_cqring(ctx);
1969 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
1970 
1971 	io_cqring_ev_posted(ctx);
1972 }
1973 
__io_req_find_next(struct io_kiocb * req)1974 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1975 {
1976 	req->flags &= ~REQ_F_LINK_HEAD;
1977 	if (req->flags & REQ_F_LINK_TIMEOUT)
1978 		io_kill_linked_timeout(req);
1979 
1980 	/*
1981 	 * If LINK is set, we have dependent requests in this chain. If we
1982 	 * didn't fail this request, queue the first one up, moving any other
1983 	 * dependencies to the next request. In case of failure, fail the rest
1984 	 * of the chain.
1985 	 */
1986 	if (likely(!(req->flags & REQ_F_FAIL_LINK)))
1987 		return io_req_link_next(req);
1988 	io_fail_links(req);
1989 	return NULL;
1990 }
1991 
io_req_find_next(struct io_kiocb * req)1992 static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1993 {
1994 	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1995 		return NULL;
1996 	return __io_req_find_next(req);
1997 }
1998 
/*
 * Queue req->task_work on the task that owns @req and wake that task.
 *
 * Returns -ESRCH if the task is exiting, otherwise the result of
 * task_work_add() (0 on success).
 */
static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
{
	enum task_work_notify_mode notify = TWA_NONE;
	struct task_struct *tsk = req->task;
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	if (tsk->flags & PF_EXITING)
		return -ESRCH;

	/*
	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
	 * processing task_work. There's no reliable way to tell if TWA_RESUME
	 * will do the job.
	 */
	if (twa_signal_ok && !(ctx->flags & IORING_SETUP_SQPOLL))
		notify = TWA_SIGNAL;

	ret = task_work_add(tsk, &req->task_work, notify);
	if (ret)
		return ret;

	wake_up_process(tsk);
	return 0;
}
2025 
__io_req_task_cancel(struct io_kiocb * req,int error)2026 static void __io_req_task_cancel(struct io_kiocb *req, int error)
2027 {
2028 	struct io_ring_ctx *ctx = req->ctx;
2029 
2030 	spin_lock_irq(&ctx->completion_lock);
2031 	io_cqring_fill_event(req, error);
2032 	io_commit_cqring(ctx);
2033 	spin_unlock_irq(&ctx->completion_lock);
2034 
2035 	io_cqring_ev_posted(ctx);
2036 	req_set_fail_links(req);
2037 	io_double_put_req(req);
2038 }
2039 
io_req_task_cancel(struct callback_head * cb)2040 static void io_req_task_cancel(struct callback_head *cb)
2041 {
2042 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2043 	struct io_ring_ctx *ctx = req->ctx;
2044 
2045 	__io_req_task_cancel(req, -ECANCELED);
2046 	percpu_ref_put(&ctx->refs);
2047 }
2048 
__io_req_task_submit(struct io_kiocb * req)2049 static void __io_req_task_submit(struct io_kiocb *req)
2050 {
2051 	struct io_ring_ctx *ctx = req->ctx;
2052 
2053 	if (!__io_sq_thread_acquire_mm(ctx)) {
2054 		mutex_lock(&ctx->uring_lock);
2055 		__io_queue_sqe(req, NULL);
2056 		mutex_unlock(&ctx->uring_lock);
2057 	} else {
2058 		__io_req_task_cancel(req, -EFAULT);
2059 	}
2060 }
2061 
/*
 * task_work callback: submit the request embedding @cb and drop one
 * reference on its ring context.
 */
static void io_req_task_submit(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
	/* read ctx up front so it is not fetched from req after submission */
	struct io_ring_ctx *ctx = req->ctx;

	__io_req_task_submit(req);
	percpu_ref_put(&ctx->refs);
}
2070 
/*
 * Arrange for @req to be submitted from task_work. A ctx reference is
 * taken here; both possible callbacks (io_req_task_submit() and
 * io_req_task_cancel()) drop it again.
 */
static void io_req_task_queue(struct io_kiocb *req)
{
	int ret;

	init_task_work(&req->task_work, io_req_task_submit);
	percpu_ref_get(&req->ctx->refs);

	ret = io_req_task_work_add(req, true);
	if (unlikely(ret)) {
		struct task_struct *tsk;

		/*
		 * Adding the work failed: switch the callback to the cancel
		 * variant and hand it to the task returned by io_wq_get_task().
		 */
		init_task_work(&req->task_work, io_req_task_cancel);
		tsk = io_wq_get_task(req->ctx->io_wq);
		task_work_add(tsk, &req->task_work, TWA_NONE);
		wake_up_process(tsk);
	}
}
2088 
/* Queue, via task_work, whatever io_req_find_next() returns for @req. */
static void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *next = io_req_find_next(req);

	if (!next)
		return;
	io_req_task_queue(next);
}
2096 
/*
 * Queue the request that io_req_find_next() yields for @req, if any, then
 * release @req through __io_free_req().
 */
static void io_free_req(struct io_kiocb *req)
{
	io_queue_next(req);
	__io_free_req(req);
}
2102 
/* State for freeing requests in bulk, see io_req_free_batch(). */
struct req_batch {
	/* requests collected for kmem_cache_free_bulk() */
	void *reqs[IO_IOPOLL_BATCH];
	/* number of valid entries in ->reqs */
	int to_free;

	/* task that the pending ->task_refs belong to, NULL if none */
	struct task_struct	*task;
	/* task references / inflight counts not yet dropped for ->task */
	int			task_refs;
};
2110 
io_init_req_batch(struct req_batch * rb)2111 static inline void io_init_req_batch(struct req_batch *rb)
2112 {
2113 	rb->to_free = 0;
2114 	rb->task_refs = 0;
2115 	rb->task = NULL;
2116 }
2117 
__io_req_free_batch_flush(struct io_ring_ctx * ctx,struct req_batch * rb)2118 static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
2119 				      struct req_batch *rb)
2120 {
2121 	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
2122 	percpu_ref_put_many(&ctx->refs, rb->to_free);
2123 	rb->to_free = 0;
2124 }
2125 
io_req_free_batch_finish(struct io_ring_ctx * ctx,struct req_batch * rb)2126 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2127 				     struct req_batch *rb)
2128 {
2129 	if (rb->to_free)
2130 		__io_req_free_batch_flush(ctx, rb);
2131 	if (rb->task) {
2132 		struct io_uring_task *tctx = rb->task->io_uring;
2133 
2134 		percpu_counter_sub(&tctx->inflight, rb->task_refs);
2135 		put_task_struct_many(rb->task, rb->task_refs);
2136 		rb->task = NULL;
2137 	}
2138 }
2139 
/*
 * Add @req to the free batch @rb. Task references are accumulated per
 * task and dropped in one go when the task changes (or in
 * io_req_free_batch_finish()); the requests themselves are bulk-freed once
 * the ->reqs array is full.
 */
static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
{
	/* fallback requests are not batched, free them directly */
	if (unlikely(io_is_fallback_req(req))) {
		io_free_req(req);
		return;
	}
	if (req->flags & REQ_F_LINK_HEAD)
		io_queue_next(req);

	if (req->task != rb->task) {
		/* different task: settle what was accumulated for the old one */
		if (rb->task) {
			struct io_uring_task *tctx = rb->task->io_uring;

			percpu_counter_sub(&tctx->inflight, rb->task_refs);
			put_task_struct_many(rb->task, rb->task_refs);
		}
		rb->task = req->task;
		rb->task_refs = 0;
	}
	rb->task_refs++;

	io_dismantle_req(req);
	rb->reqs[rb->to_free++] = req;
	/* array is full, flush now so the next add has room */
	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
		__io_req_free_batch_flush(req->ctx, rb);
}
2166 
2167 /*
2168  * Drop reference to request, return next in chain (if there is one) if this
2169  * was the last reference to this request.
2170  */
io_put_req_find_next(struct io_kiocb * req)2171 static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2172 {
2173 	struct io_kiocb *nxt = NULL;
2174 
2175 	if (refcount_dec_and_test(&req->refs)) {
2176 		nxt = io_req_find_next(req);
2177 		__io_free_req(req);
2178 	}
2179 	return nxt;
2180 }
2181 
io_put_req(struct io_kiocb * req)2182 static void io_put_req(struct io_kiocb *req)
2183 {
2184 	if (refcount_dec_and_test(&req->refs))
2185 		io_free_req(req);
2186 }
2187 
/* task_work callback: free the request that embeds @cb. */
static void io_put_req_deferred_cb(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);

	io_free_req(req);
}
2194 
/*
 * Free @req from task_work context instead of from the caller's context.
 * If the task_work cannot be added, the same callback is handed to the
 * task returned by io_wq_get_task().
 */
static void io_free_req_deferred(struct io_kiocb *req)
{
	int ret;

	init_task_work(&req->task_work, io_put_req_deferred_cb);
	ret = io_req_task_work_add(req, true);
	if (unlikely(ret)) {
		struct task_struct *tsk;

		/* fallback: run the callback from the io-wq provided task */
		tsk = io_wq_get_task(req->ctx->io_wq);
		task_work_add(tsk, &req->task_work, TWA_NONE);
		wake_up_process(tsk);
	}
}
2209 
io_put_req_deferred(struct io_kiocb * req,int refs)2210 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2211 {
2212 	if (refcount_sub_and_test(refs, &req->refs))
2213 		io_free_req_deferred(req);
2214 }
2215 
io_steal_work(struct io_kiocb * req)2216 static struct io_wq_work *io_steal_work(struct io_kiocb *req)
2217 {
2218 	struct io_kiocb *nxt;
2219 
2220 	/*
2221 	 * A ref is owned by io-wq in which context we're. So, if that's the
2222 	 * last one, it's safe to steal next work. False negatives are Ok,
2223 	 * it just will be re-punted async in io_put_work()
2224 	 */
2225 	if (refcount_read(&req->refs) != 1)
2226 		return NULL;
2227 
2228 	nxt = io_req_find_next(req);
2229 	return nxt ? &nxt->work : NULL;
2230 }
2231 
io_double_put_req(struct io_kiocb * req)2232 static void io_double_put_req(struct io_kiocb *req)
2233 {
2234 	/* drop both submit and complete references */
2235 	if (refcount_sub_and_test(2, &req->refs))
2236 		io_free_req(req);
2237 }
2238 
/*
 * Return the number of CQ entries between the cached CQ tail and the head
 * stored in the shared ring. If overflow is flagged, the overflow list is
 * flushed first, unless @noflush is set and the list is non-empty, in
 * which case -1U is returned without flushing.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
	struct io_rings *rings = ctx->rings;

	if (test_bit(0, &ctx->cq_check_overflow)) {
		/*
		 * noflush == true is from the waitqueue handler, just ensure
		 * we wake up the task, and the next invocation will flush the
		 * entries. We cannot safely do it from here.
		 */
		if (noflush && !list_empty(&ctx->cq_overflow_list))
			return -1U;

		io_cqring_overflow_flush(ctx, false, NULL, NULL);
	}

	/* See comment at the top of this file */
	smp_rmb();
	return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
}
2259 
/*
 * Return the distance between the SQ tail in the shared ring and the
 * kernel's cached SQ head.
 */
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/* make sure SQ entry isn't read before tail */
	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
2267 
io_put_kbuf(struct io_kiocb * req,struct io_buffer * kbuf)2268 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2269 {
2270 	unsigned int cflags;
2271 
2272 	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2273 	cflags |= IORING_CQE_F_BUFFER;
2274 	req->flags &= ~REQ_F_BUFFER_SELECTED;
2275 	kfree(kbuf);
2276 	return cflags;
2277 }
2278 
io_put_rw_kbuf(struct io_kiocb * req)2279 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2280 {
2281 	struct io_buffer *kbuf;
2282 
2283 	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2284 	return io_put_kbuf(req, kbuf);
2285 }
2286 
io_run_task_work(void)2287 static inline bool io_run_task_work(void)
2288 {
2289 	/*
2290 	 * Not safe to run on exiting task, and the task_work handling will
2291 	 * not add work to such a task.
2292 	 */
2293 	if (unlikely(current->flags & PF_EXITING))
2294 		return false;
2295 	if (current->task_works) {
2296 		__set_current_state(TASK_RUNNING);
2297 		task_work_run();
2298 		return true;
2299 	}
2300 
2301 	return false;
2302 }
2303 
/*
 * Complete every request on @again with -EAGAIN through __io_complete_rw().
 *
 * @again must not be empty: the first entry is taken before the list is
 * checked for emptiness.
 */
static void io_iopoll_queue(struct list_head *again)
{
	struct io_kiocb *req;

	do {
		req = list_first_entry(again, struct io_kiocb, inflight_entry);
		list_del(&req->inflight_entry);
		__io_complete_rw(req, -EAGAIN, 0, NULL);
	} while (!list_empty(again));
}
2314 
/*
 * Find and free completed poll iocbs
 *
 * Every request on @done either gets a CQE filled (bumping *@nr_events)
 * or, if its result is -EAGAIN, is moved to a local list and handed to
 * io_iopoll_queue() once the CQ ring has been committed.
 */
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
			       struct list_head *done)
{
	struct req_batch rb;
	struct io_kiocb *req;
	LIST_HEAD(again);

	/* order with ->result store in io_complete_rw_iopoll() */
	smp_rmb();

	io_init_req_batch(&rb);
	while (!list_empty(done)) {
		int cflags = 0;

		req = list_first_entry(done, struct io_kiocb, inflight_entry);
		if (READ_ONCE(req->result) == -EAGAIN) {
			/* reset the poll state and set the request aside */
			req->result = 0;
			req->iopoll_completed = 0;
			list_move_tail(&req->inflight_entry, &again);
			continue;
		}
		list_del(&req->inflight_entry);

		if (req->flags & REQ_F_BUFFER_SELECTED)
			cflags = io_put_rw_kbuf(req);

		__io_cqring_fill_event(req, req->result, cflags);
		(*nr_events)++;

		/* last reference gone: collect the request for bulk freeing */
		if (refcount_dec_and_test(&req->refs))
			io_req_free_batch(&rb, req);
	}

	io_commit_cqring(ctx);
	if (ctx->flags & IORING_SETUP_SQPOLL)
		io_cqring_ev_posted(ctx);
	io_req_free_batch_finish(ctx, &rb);

	if (!list_empty(&again))
		io_iopoll_queue(&again);
}
2359 
/*
 * Make one pass over ctx->iopoll_list: collect requests that are already
 * marked completed, poll the others through ->iopoll(), and complete what
 * was collected. *@nr_events is advanced by io_iopoll_complete().
 *
 * Returns 0, or the negative value returned by ->iopoll().
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
			long min)
{
	struct io_kiocb *req, *tmp;
	LIST_HEAD(done);
	bool spin;
	int ret;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list, and we're under the requested amount.
	 */
	spin = !ctx->poll_multi_file && *nr_events < min;

	ret = 0;
	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
		struct kiocb *kiocb = &req->rw.kiocb;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed)) {
			list_move_tail(&req->inflight_entry, &done);
			continue;
		}
		if (!list_empty(&done))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
		if (ret < 0)
			break;

		/* iopoll may have completed current req */
		if (READ_ONCE(req->iopoll_completed))
			list_move_tail(&req->inflight_entry, &done);

		/* stop spinning once a poll call returned non-zero */
		if (ret && spin)
			spin = false;
		ret = 0;
	}

	if (!list_empty(&done))
		io_iopoll_complete(ctx, nr_events, &done);

	return ret;
}
2408 
/*
 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
 * non-spinning poll check - we'll still enter the driver poll loop, but only
 * as a non-spinning completion check.
 *
 * Returns a negative error from io_do_iopoll(), 0 once *nr_events has
 * reached 'min', or 1 if the loop ended because the iopoll list became
 * empty or a reschedule is needed.
 */
static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
				long min)
{
	while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
		int ret;

		ret = io_do_iopoll(ctx, nr_events, min);
		if (ret < 0)
			return ret;
		if (*nr_events >= min)
			return 0;
	}

	return 1;
}
2429 
/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 *
 * Does nothing unless the ring was set up with IORING_SETUP_IOPOLL. Takes
 * ->uring_lock itself.
 */
static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_IOPOLL))
		return;

	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->iopoll_list)) {
		unsigned int nr_events = 0;

		/* min == 0: completion check without spinning */
		io_do_iopoll(ctx, &nr_events, 0);

		/* let it sleep and repeat later if can't complete a request */
		if (nr_events == 0)
			break;
		/*
		 * Ensure we allow local-to-the-cpu processing to take place,
		 * in this case we need to ensure that we reap all events.
		 * Also let task_work, etc. to progress by releasing the mutex
		 */
		if (need_resched()) {
			mutex_unlock(&ctx->uring_lock);
			cond_resched();
			mutex_lock(&ctx->uring_lock);
		}
	}
	mutex_unlock(&ctx->uring_lock);
}
2461 
/*
 * Poll for completions until at least one event was reaped, CQ entries
 * are already pending, an error occurs, or a reschedule is needed. With
 * @min == 0 only a single pass is made.
 *
 * Returns 0 or a negative error from io_iopoll_getevents().
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int iters = 0, ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	do {
		/*
		 * Don't enter poll loop if we already have events pending.
		 * If we do, we can potentially be spinning for commands that
		 * already triggered a CQE (eg in error).
		 */
		if (io_cqring_events(ctx, false))
			break;

		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
		/* i.e. on every 8th iteration */
		if (!(++iters & 7)) {
			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);
		}

		ret = io_iopoll_getevents(ctx, &nr_events, min);
		if (ret <= 0)
			break;
		/* ret == 1 only means "keep going", it is not returned */
		ret = 0;
	} while (min && !nr_events && !need_resched());

	mutex_unlock(&ctx->uring_lock);
	return ret;
}
2507 
/*
 * End the write on req->file via file_end_write(). For requests flagged
 * REQ_F_ISREG, lockdep is first told that freeze protection is held.
 */
static void kiocb_end_write(struct io_kiocb *req)
{
	/*
	 * Tell lockdep we inherited freeze protection from submission
	 * thread.
	 */
	if (req->flags & REQ_F_ISREG) {
		struct inode *inode = file_inode(req->file);

		__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
	}
	file_end_write(req->file);
}
2521 
/*
 * Common completion for read/write requests: end the write if this was
 * one, call req_set_fail_links() if @res differs from req->result,
 * release a selected buffer, and complete the request with @res.
 */
static void io_complete_rw_common(struct kiocb *kiocb, long res,
				  struct io_comp_state *cs)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
	int cflags = 0;

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);

	if (res != req->result)
		req_set_fail_links(req);
	/* the buffer ID is reported to userspace through the CQE flags */
	if (req->flags & REQ_F_BUFFER_SELECTED)
		cflags = io_put_rw_kbuf(req);
	__io_req_complete(req, res, cflags, cs);
}
2537 
#ifdef CONFIG_BLOCK
/*
 * Prepare @req for being issued again. @error is a previous error to fail
 * the request with (0 if none).
 *
 * Returns true if the request has async data set up and can be resubmitted.
 * Otherwise req_set_fail_links() is called and false is returned.
 */
static bool io_resubmit_prep(struct io_kiocb *req, int error)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	ssize_t ret = -ECANCELED;
	struct iov_iter iter;
	int rw;

	if (error) {
		ret = error;
		goto end_req;
	}

	switch (req->opcode) {
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
		rw = READ;
		break;
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		rw = WRITE;
		break;
	default:
		printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
				req->opcode);
		goto end_req;
	}

	if (!req->async_data) {
		/* no async state yet: import the iovec and stash it */
		ret = io_import_iovec(rw, req, &iovec, &iter, false);
		if (ret < 0)
			goto end_req;
		ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
		if (!ret)
			return true;
		kfree(iovec);
	} else {
		/* async state already exists, nothing to prepare */
		return true;
	}
end_req:
	/* note: ret is not propagated, the caller only sees false */
	req_set_fail_links(req);
	return false;
}
#endif
2584 
/*
 * Try to reissue a read/write that completed with @res through the async
 * work queue. Only attempted for block devices and regular files, for
 * -EAGAIN or -EOPNOTSUPP results, and when not already running in an
 * io-wq worker.
 *
 * Returns true if the request was queued again (an extra reference is
 * taken in that case), false otherwise. Always false without CONFIG_BLOCK.
 */
static bool io_rw_reissue(struct io_kiocb *req, long res)
{
#ifdef CONFIG_BLOCK
	umode_t mode = file_inode(req->file)->i_mode;
	int ret;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
		return false;

	ret = io_sq_thread_acquire_mm(req->ctx, req);

	/* a non-zero ret makes io_resubmit_prep() fail the request */
	if (io_resubmit_prep(req, ret)) {
		refcount_inc(&req->refs);
		io_queue_async_work(req);
		return true;
	}

#endif
	return false;
}
2607 
__io_complete_rw(struct io_kiocb * req,long res,long res2,struct io_comp_state * cs)2608 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2609 			     struct io_comp_state *cs)
2610 {
2611 	if (!io_rw_reissue(req, res))
2612 		io_complete_rw_common(&req->rw.kiocb, res, cs);
2613 }
2614 
/*
 * ->ki_complete handler for rings without IORING_SETUP_IOPOLL (installed
 * by io_prep_rw()); completes the request without a completion state.
 */
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	__io_complete_rw(req, res, res2, NULL);
}
2621 
/*
 * ->ki_complete handler for IORING_SETUP_IOPOLL rings (installed by
 * io_prep_rw()). No CQE is posted here: the result is stored in the
 * request and ->iopoll_completed is set so that the polling side can
 * pick it up. @res2 is not used.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);

	/* -EAGAIN is retried by the reaping side, don't fail links for it */
	if (res != -EAGAIN && res != req->result)
		req_set_fail_links(req);

	WRITE_ONCE(req->result, res);
	/* order with io_iopoll_complete() checking ->result */
	smp_wmb();
	WRITE_ONCE(req->iopoll_completed, 1);
}
2637 
/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_iopoll_getevents() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_file = false;
	} else if (!ctx->poll_multi_file) {
		struct io_kiocb *list_req;

		/* only the first queued request is compared against */
		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
						inflight_entry);
		if (list_req->file != req->file)
			ctx->poll_multi_file = true;
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		list_add(&req->inflight_entry, &ctx->iopoll_list);
	else
		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);

	/* with SQPOLL, wake a waiter on the sq_data waitqueue, if any */
	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
	    wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
}
2677 
/*
 * Return the file references still held in @state (->has_refs of them)
 * and forget the cached file.
 */
static void __io_state_file_put(struct io_submit_state *state)
{
	if (state->has_refs)
		fput_many(state->file, state->has_refs);
	state->file = NULL;
}
2684 
io_state_file_put(struct io_submit_state * state)2685 static inline void io_state_file_put(struct io_submit_state *state)
2686 {
2687 	if (state->file)
2688 		__io_state_file_put(state);
2689 }
2690 
/*
 * Get as many references to a file as we have IOs left in this submission,
 * assuming most submissions are for one file, or at least that each file
 * has more than one submission.
 *
 * Without a submit @state this is a plain fget(). Returns the file, or
 * NULL if the lookup failed.
 */
static struct file *__io_file_get(struct io_submit_state *state, int fd)
{
	if (!state)
		return fget(fd);

	if (state->file) {
		/* same fd as last time: hand out one of the cached references */
		if (state->fd == fd) {
			state->has_refs--;
			return state->file;
		}
		/* different fd: give back what is left for the old file */
		__io_state_file_put(state);
	}
	state->file = fget_many(fd, state->ios_left);
	if (!state->file)
		return NULL;

	state->fd = fd;
	/* one of the ->ios_left references is consumed by this call */
	state->has_refs = state->ios_left - 1;
	return state->file;
}
2716 
/*
 * Return true if @bdev is NULL or its queue passes blk_queue_nowait().
 * Always true when CONFIG_BLOCK is not set.
 */
static bool io_bdev_nowait(struct block_device *bdev)
{
#ifdef CONFIG_BLOCK
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
#else
	return true;
#endif
}
2725 
2726 /*
2727  * If we tracked the file through the SCM inflight mechanism, we could support
2728  * any file. For now, just ensure that anything potentially problematic is done
2729  * inline.
2730  */
io_file_supports_async(struct file * file,int rw)2731 static bool io_file_supports_async(struct file *file, int rw)
2732 {
2733 	umode_t mode = file_inode(file)->i_mode;
2734 
2735 	if (S_ISBLK(mode)) {
2736 		if (io_bdev_nowait(file->f_inode->i_bdev))
2737 			return true;
2738 		return false;
2739 	}
2740 	if (S_ISCHR(mode) || S_ISSOCK(mode))
2741 		return true;
2742 	if (S_ISREG(mode)) {
2743 		if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2744 		    file->f_op != &io_uring_fops)
2745 			return true;
2746 		return false;
2747 	}
2748 
2749 	/* any ->read/write should understand O_NONBLOCK */
2750 	if (file->f_flags & O_NONBLOCK)
2751 		return true;
2752 
2753 	if (!(file->f_mode & FMODE_NOWAIT))
2754 		return false;
2755 
2756 	if (rw == READ)
2757 		return file->f_op->read_iter != NULL;
2758 
2759 	return file->f_op->write_iter != NULL;
2760 }
2761 
/*
 * Prepare the kiocb of a read/write request from @sqe: file position,
 * flags, I/O priority and completion handler, plus the address, length
 * and buffer index stored in req->rw / req->buf_index.
 *
 * Returns 0 on success or a negative error: whatever kiocb_set_rw_flags()
 * or ioprio_check_cap() return, -EOPNOTSUPP if an IOPOLL ring is used
 * with a file that is not opened for direct I/O or lacks ->iopoll, and
 * -EINVAL if IOCB_HIPRI is requested on a non-IOPOLL ring.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct kiocb *kiocb = &req->rw.kiocb;
	unsigned ioprio;
	int ret;

	if (S_ISREG(file_inode(req->file)->i_mode))
		req->flags |= REQ_F_ISREG;

	kiocb->ki_pos = READ_ONCE(sqe->off);
	/* offset -1 on a non-stream file means "use the file's position" */
	if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		kiocb->ki_pos = req->file->f_pos;
	}
	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
	if (unlikely(ret))
		return ret;

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		kiocb->ki_ioprio = ioprio;
	} else
		kiocb->ki_ioprio = get_current_ioprio();

	/* don't allow async punt if RWF_NOWAIT was requested */
	if (kiocb->ki_flags & IOCB_NOWAIT)
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
		    !kiocb->ki_filp->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->ki_flags |= IOCB_HIPRI;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	req->rw.addr = READ_ONCE(sqe->addr);
	req->rw.len = READ_ONCE(sqe->len);
	req->buf_index = READ_ONCE(sqe->buf_index);
	return 0;
}
2816 
/*
 * Hand the synchronous result @ret of a read/write to ->ki_complete().
 * -EIOCBQUEUED is left alone, and the restart error codes are converted to
 * -EINTR first.
 */
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * We can't just restart the syscall, since previously
		 * submitted sqes may already be in progress. Just fail this
		 * IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		kiocb->ki_complete(kiocb, ret, 0);
	}
}
2837 
/*
 * Finish a read/write whose issue returned @ret: account for bytes that
 * were already transferred, write back the file position for requests
 * using the current position, and complete the request.
 */
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
		       struct io_comp_state *cs)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (io && io->bytes_done > 0) {
		/* an error after partial progress reports the partial count */
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = kiocb->ki_pos;
	/* non-negative results on the regular handler can pass @cs along */
	if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
		__io_complete_rw(req, ret, 0, cs);
	else
		io_rw_done(kiocb, ret);
}
2859 
/*
 * Set up @iter as a bvec iterator over the part of a registered buffer
 * described by req->rw.addr / req->rw.len, using the buffer selected by
 * req->buf_index.
 *
 * Returns the length on success, or -EFAULT if the buffer index is out of
 * range, the address range overflows, or the range is not fully inside
 * the registered buffer.
 */
static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
			       struct iov_iter *iter)
{
	struct io_ring_ctx *ctx = req->ctx;
	size_t len = req->rw.len;
	struct io_mapped_ubuf *imu;
	u16 index, buf_index = req->buf_index;
	size_t offset;
	u64 buf_addr;

	if (unlikely(buf_index >= ctx->nr_user_bufs))
		return -EFAULT;
	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
	imu = &ctx->user_bufs[index];
	buf_addr = req->rw.addr;

	/* overflow */
	if (buf_addr + len < buf_addr)
		return -EFAULT;
	/* not inside the mapped region */
	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			/* remaining offset within the page-sized segment */
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return len;
}
2927 
/* Counterpart of io_ring_submit_lock(): unlock only if @needs_lock is set. */
static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
	if (!needs_lock)
		return;
	mutex_unlock(&ctx->uring_lock);
}
2933 
/*
 * Take ->uring_lock if the caller does not hold it yet.
 *
 * "Normal" inline submissions already hold the uring_lock, as it is taken
 * from the system call; the same goes for the SQPOLL offload. Only a
 * request that was detached and is issued from an async worker thread
 * needs to grab the lock here, which is what @needs_lock signals.
 */
static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
{
	if (!needs_lock)
		return;
	mutex_lock(&ctx->uring_lock);
}
2945 
/*
 * Pick a provided buffer from group @bgid for @req.
 *
 * If the request already has a buffer selected, @kbuf is returned as is.
 * Otherwise a buffer is removed from the group and *@len is clamped to
 * the buffer's length. Returns ERR_PTR(-ENOBUFS) if the group does not
 * exist. ->uring_lock is taken around the lookup when @needs_lock is set.
 */
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
					  int bgid, struct io_buffer *kbuf,
					  bool needs_lock)
{
	struct io_buffer *head;

	if (req->flags & REQ_F_BUFFER_SELECTED)
		return kbuf;

	io_ring_submit_lock(req->ctx, needs_lock);

	lockdep_assert_held(&req->ctx->uring_lock);

	head = idr_find(&req->ctx->io_buffer_idr, bgid);
	if (head) {
		if (!list_empty(&head->list)) {
			kbuf = list_last_entry(&head->list, struct io_buffer,
							list);
			list_del(&kbuf->list);
		} else {
			/* head is the last buffer: use it and drop the group */
			kbuf = head;
			idr_remove(&req->ctx->io_buffer_idr, bgid);
		}
		if (*len > kbuf->len)
			*len = kbuf->len;
	} else {
		kbuf = ERR_PTR(-ENOBUFS);
	}

	io_ring_submit_unlock(req->ctx, needs_lock);

	return kbuf;
}
2979 
/*
 * Select a provided buffer for a read/write request. The buffer group ID
 * is taken from req->buf_index. On success the io_buffer pointer is stored
 * in req->rw.addr, REQ_F_BUFFER_SELECTED is set and the buffer's user
 * address is returned; *@len may have been reduced. On failure an ERR_PTR
 * is returned.
 */
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
					bool needs_lock)
{
	struct io_buffer *kbuf;
	u16 bgid;

	/* only meaningful if a buffer was selected earlier, see io_buffer_select() */
	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
	bgid = req->buf_index;
	kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
	if (IS_ERR(kbuf))
		return kbuf;
	req->rw.addr = (u64) (unsigned long) kbuf;
	req->flags |= REQ_F_BUFFER_SELECTED;
	return u64_to_user_ptr(kbuf->addr);
}
2995 
#ifdef CONFIG_COMPAT
/*
 * Compat variant of __io_iov_buffer_select(): read the length from the
 * single compat_iovec at req->rw.addr, select a provided buffer and fill
 * iov[0] with it.
 *
 * Returns 0 on success, -EFAULT if the user iovec cannot be read, -EINVAL
 * for a negative length, or the error from buffer selection.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
				bool needs_lock)
{
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;
	void __user *buf;
	ssize_t len;

	uiov = u64_to_user_ptr(req->rw.addr);
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	len = clen;
	/* len may be reduced to the size of the selected buffer */
	buf = io_rw_buffer_select(req, &len, needs_lock);
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	iov[0].iov_base = buf;
	iov[0].iov_len = (compat_size_t) len;
	return 0;
}
#endif
3022 
/*
 * Copy the single user iovec at req->rw.addr into iov[0], select a
 * provided buffer for it and replace base/length with those of the
 * selected buffer.
 *
 * Returns 0 on success, -EFAULT if the copy fails, -EINVAL if the length
 * is negative when viewed as ssize_t, or the error from buffer selection.
 */
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
				      bool needs_lock)
{
	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
	void __user *buf;
	ssize_t len;

	if (copy_from_user(iov, uiov, sizeof(*uiov)))
		return -EFAULT;

	len = iov[0].iov_len;
	if (len < 0)
		return -EINVAL;
	/* len may be reduced to the size of the selected buffer */
	buf = io_rw_buffer_select(req, &len, needs_lock);
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	iov[0].iov_base = buf;
	iov[0].iov_len = len;
	return 0;
}
3043 
io_iov_buffer_select(struct io_kiocb * req,struct iovec * iov,bool needs_lock)3044 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3045 				    bool needs_lock)
3046 {
3047 	if (req->flags & REQ_F_BUFFER_SELECTED) {
3048 		struct io_buffer *kbuf;
3049 
3050 		kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3051 		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3052 		iov[0].iov_len = kbuf->len;
3053 		return 0;
3054 	}
3055 	if (!req->rw.len)
3056 		return 0;
3057 	else if (req->rw.len > 1)
3058 		return -EINVAL;
3059 
3060 #ifdef CONFIG_COMPAT
3061 	if (req->ctx->compat)
3062 		return io_compat_import(req, iov, needs_lock);
3063 #endif
3064 
3065 	return __io_iov_buffer_select(req, iov, needs_lock);
3066 }
3067 
/*
 * Set up @iter for a read/write request, depending on the opcode:
 *  - fixed read/write: import from a registered buffer,
 *  - IORING_OP_READ/WRITE: a single range, optionally a selected buffer,
 *  - vectored with buffer select: one iovec backed by a selected buffer,
 *  - otherwise: a regular user iovec import.
 *
 * *@iovec is set to NULL on all paths except the final __import_iovec()
 * one, which gets to decide what to store there. Returns the number of
 * bytes described by @iter or a negative error.
 */
static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
				 struct iovec **iovec, struct iov_iter *iter,
				 bool needs_lock)
{
	void __user *buf = u64_to_user_ptr(req->rw.addr);
	size_t sqe_len = req->rw.len;
	ssize_t ret;
	u8 opcode;

	opcode = req->opcode;
	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
		*iovec = NULL;
		return io_import_fixed(req, rw, iter);
	}

	/* buffer index only valid with fixed read/write, or buffer select  */
	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
		return -EINVAL;

	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
		if (req->flags & REQ_F_BUFFER_SELECT) {
			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
			if (IS_ERR(buf))
				return PTR_ERR(buf);
			/* remember the possibly reduced length */
			req->rw.len = sqe_len;
		}

		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
		*iovec = NULL;
		return ret < 0 ? ret : sqe_len;
	}

	if (req->flags & REQ_F_BUFFER_SELECT) {
		ret = io_iov_buffer_select(req, *iovec, needs_lock);
		if (!ret) {
			ret = (*iovec)->iov_len;
			iov_iter_init(iter, rw, *iovec, 1, ret);
		}
		*iovec = NULL;
		return ret;
	}

	return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
			      req->ctx->compat);
}
3113 
io_import_iovec(int rw,struct io_kiocb * req,struct iovec ** iovec,struct iov_iter * iter,bool needs_lock)3114 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
3115 			       struct iovec **iovec, struct iov_iter *iter,
3116 			       bool needs_lock)
3117 {
3118 	struct io_async_rw *iorw = req->async_data;
3119 
3120 	if (!iorw)
3121 		return __io_import_iovec(rw, req, iovec, iter, needs_lock);
3122 	*iovec = NULL;
3123 	return iov_iter_count(&iorw->iter);
3124 }
3125 
io_kiocb_ppos(struct kiocb * kiocb)3126 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3127 {
3128 	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3129 }
3130 
3131 /*
3132  * For files that don't have ->read_iter() and ->write_iter(), handle them
3133  * by looping over ->read() or ->write() manually.
3134  */
loop_rw_iter(int rw,struct io_kiocb * req,struct iov_iter * iter)3135 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3136 {
3137 	struct kiocb *kiocb = &req->rw.kiocb;
3138 	struct file *file = req->file;
3139 	ssize_t ret = 0;
3140 
3141 	/*
3142 	 * Don't support polled IO through this interface, and we can't
3143 	 * support non-blocking either. For the latter, this just causes
3144 	 * the kiocb to be handled from an async context.
3145 	 */
3146 	if (kiocb->ki_flags & IOCB_HIPRI)
3147 		return -EOPNOTSUPP;
3148 	if (kiocb->ki_flags & IOCB_NOWAIT)
3149 		return -EAGAIN;
3150 
3151 	while (iov_iter_count(iter)) {
3152 		struct iovec iovec;
3153 		ssize_t nr;
3154 
3155 		if (!iov_iter_is_bvec(iter)) {
3156 			iovec = iov_iter_iovec(iter);
3157 		} else {
3158 			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3159 			iovec.iov_len = req->rw.len;
3160 		}
3161 
3162 		if (rw == READ) {
3163 			nr = file->f_op->read(file, iovec.iov_base,
3164 					      iovec.iov_len, io_kiocb_ppos(kiocb));
3165 		} else {
3166 			nr = file->f_op->write(file, iovec.iov_base,
3167 					       iovec.iov_len, io_kiocb_ppos(kiocb));
3168 		}
3169 
3170 		if (nr < 0) {
3171 			if (!ret)
3172 				ret = nr;
3173 			break;
3174 		}
3175 		ret += nr;
3176 		if (nr != iovec.iov_len)
3177 			break;
3178 		req->rw.len -= nr;
3179 		req->rw.addr += nr;
3180 		iov_iter_advance(iter, nr);
3181 	}
3182 
3183 	return ret;
3184 }
3185 
/*
 * Save the current iterator state into the request's async data so the IO
 * can be resumed later.
 *
 * @req:      request whose async_data (struct io_async_rw) receives the copy
 * @iovec:    separately allocated iovec array, or NULL if none is in use
 * @fast_iov: the inline iovec array @iter may currently be pointing into
 * @iter:     iterator to snapshot
 *
 * When @iovec is non-NULL it is recorded in free_iovec and the request is
 * flagged REQ_F_NEED_CLEANUP. When it is NULL, the copied iterator is
 * re-pointed at the async data's own fast_iov array and the remaining
 * segments are copied over.
 */
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *rw = req->async_data;

	memcpy(&rw->iter, iter, sizeof(*iter));
	rw->free_iovec = iovec;
	rw->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		/* the saved iterator must reference storage that outlives the caller */
		rw->iter.iov = rw->fast_iov;
		if (iter->iov != fast_iov) {
			/* iterator already moved past some segments: keep the same offset */
			iov_off = iter->iov - fast_iov;
			rw->iter.iov += iov_off;
		}
		/* skip the copy when @fast_iov already is the async data's array */
		if (rw->fast_iov != fast_iov)
			memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}
3212 
__io_alloc_async_data(struct io_kiocb * req)3213 static inline int __io_alloc_async_data(struct io_kiocb *req)
3214 {
3215 	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3216 	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3217 	return req->async_data == NULL;
3218 }
3219 
io_alloc_async_data(struct io_kiocb * req)3220 static int io_alloc_async_data(struct io_kiocb *req)
3221 {
3222 	if (!io_op_defs[req->opcode].needs_async_data)
3223 		return 0;
3224 
3225 	return  __io_alloc_async_data(req);
3226 }
3227 
/*
 * Make sure a read/write request has async data holding a snapshot of
 * @iter.
 *
 * Nothing is done unless @force is set or the opcode is marked
 * needs_async_data. If async data already exists it is left untouched;
 * otherwise it is allocated and filled by io_req_map_rw().
 *
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     const struct iovec *fast_iov,
			     struct iov_iter *iter, bool force)
{
	if (!(force || io_op_defs[req->opcode].needs_async_data))
		return 0;

	/* already set up on an earlier pass */
	if (req->async_data)
		return 0;

	if (__io_alloc_async_data(req))
		return -ENOMEM;

	io_req_map_rw(req, iovec, fast_iov, iter);
	return 0;
}
3242 
io_rw_prep_async(struct io_kiocb * req,int rw)3243 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3244 {
3245 	struct io_async_rw *iorw = req->async_data;
3246 	struct iovec *iov = iorw->fast_iov;
3247 	ssize_t ret;
3248 
3249 	ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
3250 	if (unlikely(ret < 0))
3251 		return ret;
3252 
3253 	iorw->bytes_done = 0;
3254 	iorw->free_iovec = iov;
3255 	if (iov)
3256 		req->flags |= REQ_F_NEED_CLEANUP;
3257 	return 0;
3258 }
3259 
io_read_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3260 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3261 {
3262 	ssize_t ret;
3263 
3264 	ret = io_prep_rw(req, sqe);
3265 	if (ret)
3266 		return ret;
3267 
3268 	if (unlikely(!(req->file->f_mode & FMODE_READ)))
3269 		return -EBADF;
3270 
3271 	/* either don't need iovec imported or already have it */
3272 	if (!req->async_data)
3273 		return 0;
3274 	return io_rw_prep_async(req, READ);
3275 }
3276 
/*
 * This is our waitqueue callback handler, registered through lock_page_async()
 * when we initially tried to do the IO with the iocb armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 *
 * Returns 0 when the wakeup key does not match this waiter (nothing is
 * done), and 1 once the retry (or its cancelation) has been queued.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct wait_page_key *key = arg;
	int ret;

	wpq = container_of(wait, struct wait_page_queue, wait);

	/* wakeup is for a different page/bit: stay on the waitqueue */
	if (!wake_page_match(wpq, key))
		return 0;

	/* disarm: clear the flag and take ourselves off the waitqueue */
	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);

	init_task_work(&req->task_work, io_req_task_submit);
	percpu_ref_get(&req->ctx->refs);

	/* submit ref gets dropped, acquire a new one */
	refcount_inc(&req->refs);
	ret = io_req_task_work_add(req, true);
	if (unlikely(ret)) {
		struct task_struct *tsk;

		/* queue just for cancelation */
		init_task_work(&req->task_work, io_req_task_cancel);
		tsk = io_wq_get_task(req->ctx->io_wq);
		task_work_add(tsk, &req->task_work, TWA_NONE);
		/* TWA_NONE was used above, so wake the task explicitly */
		wake_up_process(tsk);
	}
	return 1;
}
3320 
3321 /*
3322  * This controls whether a given IO request should be armed for async page
3323  * based retry. If we return false here, the request is handed to the async
3324  * worker threads for retry. If we're doing buffered reads on a regular file,
3325  * we prepare a private wait_page_queue entry and retry the operation. This
3326  * will either succeed because the page is now uptodate and unlocked, or it
3327  * will register a callback when the page is unlocked at IO completion. Through
3328  * that callback, io_uring uses task_work to setup a retry of the operation.
3329  * That retry will attempt the buffered read again. The retry will generally
3330  * succeed, or in rare cases where it fails, we then fall back to using the
3331  * async worker threads for a blocking retry.
3332  */
io_rw_should_retry(struct io_kiocb * req)3333 static bool io_rw_should_retry(struct io_kiocb *req)
3334 {
3335 	struct io_async_rw *rw = req->async_data;
3336 	struct wait_page_queue *wait = &rw->wpq;
3337 	struct kiocb *kiocb = &req->rw.kiocb;
3338 
3339 	/* never retry for NOWAIT, we just complete with -EAGAIN */
3340 	if (req->flags & REQ_F_NOWAIT)
3341 		return false;
3342 
3343 	/* Only for buffered IO */
3344 	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3345 		return false;
3346 
3347 	/*
3348 	 * just use poll if we can, and don't attempt if the fs doesn't
3349 	 * support callback based unlocks
3350 	 */
3351 	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3352 		return false;
3353 
3354 	wait->wait.func = io_async_buf_func;
3355 	wait->wait.private = req;
3356 	wait->wait.flags = 0;
3357 	INIT_LIST_HEAD(&wait->wait.entry);
3358 	kiocb->ki_flags |= IOCB_WAITQ;
3359 	kiocb->ki_flags &= ~IOCB_NOWAIT;
3360 	kiocb->ki_waitq = wait;
3361 	return true;
3362 }
3363 
io_iter_do_read(struct io_kiocb * req,struct iov_iter * iter)3364 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3365 {
3366 	if (req->file->f_op->read_iter)
3367 		return call_read_iter(req->file, &req->rw.kiocb, iter);
3368 	else if (req->file->f_op->read)
3369 		return loop_rw_iter(READ, req, iter);
3370 	else
3371 		return -EINVAL;
3372 }
3373 
/*
 * Issue handler for read requests.
 *
 * Imports the request's buffers into an iov_iter (or reuses the iterator
 * saved in req->async_data), attempts the read, and completes the request
 * through kiocb_done() when no retry is needed. When a nonblocking attempt
 * returns -EAGAIN or comes up short, the iterator state is saved with
 * io_setup_async_rw() and the read is either retried with IOCB_WAITQ armed
 * (see io_rw_should_retry()) or -EAGAIN is returned.
 *
 * @req:            the read request
 * @force_nonblock: true if this attempt must not block
 * @cs:             completion state, passed through to kiocb_done()
 *
 * Returns 0 if the request was completed here or the read returned
 * -EIOCBQUEUED, -EAGAIN if the read needs to be retried by the caller
 * (presumably from a blocking context - confirm against the caller), or a
 * negative error from the import, rw_verify_area() or io_setup_async_rw().
 */
static int io_read(struct io_kiocb *req, bool force_nonblock,
		   struct io_comp_state *cs)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter __iter, *iter = &__iter;
	struct io_async_rw *rw = req->async_data;
	ssize_t io_size, ret, ret2;
	size_t iov_count;
	bool no_async;

	/* resuming: continue from the iterator saved in the async data */
	if (rw)
		iter = &rw->iter;

	ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
	if (ret < 0)
		return ret;
	iov_count = iov_iter_count(iter);
	io_size = ret;
	req->result = io_size;
	/* from here on ret holds the bytes read by the latest attempt */
	ret = 0;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	else
		kiocb->ki_flags |= IOCB_NOWAIT;


	/* If the file doesn't support async, just async punt */
	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
	if (no_async)
		goto copy_iov;

	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
	if (unlikely(ret))
		goto out_free;

	ret = io_iter_do_read(req, iter);

	if (!ret) {
		goto done;
	} else if (ret == -EIOCBQUEUED) {
		/* queued: completion is reported elsewhere, nothing to post here */
		ret = 0;
		goto out_free;
	} else if (ret == -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK marked file */
		if (req->file->f_flags & O_NONBLOCK)
			goto done;
		/* some cases will consume bytes even on error returns */
		iov_iter_revert(iter, iov_count - iov_iter_count(iter));
		ret = 0;
		goto copy_iov;
	} else if (ret < 0) {
		/* make sure -ERESTARTSYS -> -EINTR is done */
		goto done;
	}

	/* read it all, or we did blocking attempt. no retry. */
	if (!iov_iter_count(iter) || !force_nonblock ||
	    (req->file->f_flags & O_NONBLOCK))
		goto done;

	/* short nonblocking read: io_size now tracks what is still outstanding */
	io_size -= ret;
copy_iov:
	/* force == true: always save the iterator, whatever the opcode needs */
	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
	if (ret2) {
		ret = ret2;
		goto out_free;
	}
	if (no_async)
		return -EAGAIN;
	rw = req->async_data;
	/* it's copied and will be cleaned with ->io */
	iovec = NULL;
	/* now use our persistent iterator, if we aren't already */
	iter = &rw->iter;
retry:
	/* account the bytes from the attempt that just finished */
	rw->bytes_done += ret;
	/* if we can retry, do so with the callbacks armed */
	if (!io_rw_should_retry(req)) {
		kiocb->ki_flags &= ~IOCB_WAITQ;
		return -EAGAIN;
	}

	/*
	 * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
	 * get -EIOCBQUEUED, then we'll get a notification when the desired
	 * page gets unlocked. We can also get a partial read here, and if we
	 * do, then just retry at the new offset.
	 */
	ret = io_iter_do_read(req, iter);
	if (ret == -EIOCBQUEUED) {
		ret = 0;
		goto out_free;
	} else if (ret > 0 && ret < io_size) {
		/* we got some bytes, but not all. retry. */
		goto retry;
	}
done:
	kiocb_done(kiocb, ret, cs);
	ret = 0;
out_free:
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}
3485 
io_write_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3486 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3487 {
3488 	ssize_t ret;
3489 
3490 	ret = io_prep_rw(req, sqe);
3491 	if (ret)
3492 		return ret;
3493 
3494 	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3495 		return -EBADF;
3496 
3497 	/* either don't need iovec imported or already have it */
3498 	if (!req->async_data)
3499 		return 0;
3500 	return io_rw_prep_async(req, WRITE);
3501 }
3502 
/*
 * Issue handler for write requests.
 *
 * Imports the request's buffers into an iov_iter (or reuses the iterator
 * saved in req->async_data), attempts the write, and completes the request
 * through kiocb_done(). When the write cannot be done in nonblocking mode,
 * the iterator is rewound, saved with io_setup_async_rw(), and -EAGAIN is
 * returned.
 *
 * @req:            the write request
 * @force_nonblock: true if this attempt must not block
 * @cs:             completion state, passed through to kiocb_done()
 *
 * Returns 0 if the request was completed through kiocb_done(), -EAGAIN if
 * the write state was saved for a later retry, or a negative error from the
 * import, rw_verify_area() or io_setup_async_rw().
 */
static int io_write(struct io_kiocb *req, bool force_nonblock,
		    struct io_comp_state *cs)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter __iter, *iter = &__iter;
	struct io_async_rw *rw = req->async_data;
	size_t iov_count;
	ssize_t ret, ret2, io_size;

	/* resuming: continue from the iterator saved in the async data */
	if (rw)
		iter = &rw->iter;

	ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
	if (ret < 0)
		return ret;
	iov_count = iov_iter_count(iter);
	io_size = ret;
	req->result = io_size;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	else
		kiocb->ki_flags |= IOCB_NOWAIT;

	/* If the file doesn't support async, just async punt */
	if (force_nonblock && !io_file_supports_async(req->file, WRITE))
		goto copy_iov;

	/* file path doesn't support NOWAIT for non-direct_IO */
	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
	    (req->flags & REQ_F_ISREG))
		goto copy_iov;

	/* on success this leaves ret == 0, which is what the done path returns */
	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
	if (unlikely(ret))
		goto out_free;

	/*
	 * Open-code file_start_write here to grab freeze protection,
	 * which will be released by another thread in
	 * io_complete_rw().  Fool lockdep by telling it the lock got
	 * released so that it doesn't complain about the held lock when
	 * we return to userspace.
	 */
	if (req->flags & REQ_F_ISREG) {
		sb_start_write(file_inode(req->file)->i_sb);
		__sb_writers_release(file_inode(req->file)->i_sb,
					SB_FREEZE_WRITE);
	}
	kiocb->ki_flags |= IOCB_WRITE;

	/* prefer ->write_iter(), fall back to looping over ->write() */
	if (req->file->f_op->write_iter)
		ret2 = call_write_iter(req->file, kiocb, iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, req, iter);
	else
		ret2 = -EINVAL;

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK marked file */
	if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
			goto copy_iov;
done:
		kiocb_done(kiocb, ret2, cs);
	} else {
copy_iov:
		/* some cases will consume bytes even on error returns */
		iov_iter_revert(iter, iov_count - iov_iter_count(iter));
		/* force == false: only saved if the opcode is marked needs_async_data */
		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
		if (!ret)
			return -EAGAIN;
	}
out_free:
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}
3592 
__io_splice_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3593 static int __io_splice_prep(struct io_kiocb *req,
3594 			    const struct io_uring_sqe *sqe)
3595 {
3596 	struct io_splice* sp = &req->splice;
3597 	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3598 
3599 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3600 		return -EINVAL;
3601 
3602 	sp->file_in = NULL;
3603 	sp->len = READ_ONCE(sqe->len);
3604 	sp->flags = READ_ONCE(sqe->splice_flags);
3605 
3606 	if (unlikely(sp->flags & ~valid_flags))
3607 		return -EINVAL;
3608 
3609 	sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3610 				  (sp->flags & SPLICE_F_FD_IN_FIXED));
3611 	if (!sp->file_in)
3612 		return -EBADF;
3613 	req->flags |= REQ_F_NEED_CLEANUP;
3614 
3615 	if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3616 		/*
3617 		 * Splice operation will be punted aync, and here need to
3618 		 * modify io_wq_work.flags, so initialize io_wq_work firstly.
3619 		 */
3620 		io_req_init_async(req);
3621 		req->work.flags |= IO_WQ_WORK_UNBOUND;
3622 	}
3623 
3624 	return 0;
3625 }
3626 
io_tee_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3627 static int io_tee_prep(struct io_kiocb *req,
3628 		       const struct io_uring_sqe *sqe)
3629 {
3630 	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3631 		return -EINVAL;
3632 	return __io_splice_prep(req, sqe);
3633 }
3634 
/*
 * Issue handler for tee. Never runs in nonblocking mode (-EAGAIN is
 * returned instead). Calls do_tee() when the length is non-zero, hands the
 * input file back via io_put_file(), and completes the request with the
 * result. A result different from the requested length marks the link as
 * failed.
 */
static int io_tee(struct io_kiocb *req, bool force_nonblock)
{
	struct io_splice *splice = &req->splice;
	struct file *src = splice->file_in;
	struct file *dst = splice->file_out;
	unsigned int tee_flags = splice->flags & ~SPLICE_F_FD_IN_FIXED;
	long copied = 0;

	if (force_nonblock)
		return -EAGAIN;

	if (splice->len)
		copied = do_tee(src, dst, splice->len, tee_flags);

	io_put_file(req, src, (splice->flags & SPLICE_F_FD_IN_FIXED));
	req->flags &= ~REQ_F_NEED_CLEANUP;

	if (copied != splice->len)
		req_set_fail_links(req);

	io_req_complete(req, copied);
	return 0;
}
3656 
io_splice_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3657 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3658 {
3659 	struct io_splice* sp = &req->splice;
3660 
3661 	sp->off_in = READ_ONCE(sqe->splice_off_in);
3662 	sp->off_out = READ_ONCE(sqe->off);
3663 	return __io_splice_prep(req, sqe);
3664 }
3665 
/*
 * Issue handler for splice. Never runs in nonblocking mode (-EAGAIN is
 * returned instead). An offset of -1 means "no offset" and is passed to
 * do_splice() as a NULL pointer. The input file is handed back via
 * io_put_file() and the request is completed with the result; a result
 * different from the requested length marks the link as failed.
 */
static int io_splice(struct io_kiocb *req, bool force_nonblock)
{
	struct io_splice *splice = &req->splice;
	struct file *src = splice->file_in;
	struct file *dst = splice->file_out;
	unsigned int splice_flags = splice->flags & ~SPLICE_F_FD_IN_FIXED;
	loff_t *src_off = NULL, *dst_off = NULL;
	long moved = 0;

	if (force_nonblock)
		return -EAGAIN;

	if (splice->off_in != -1)
		src_off = &splice->off_in;
	if (splice->off_out != -1)
		dst_off = &splice->off_out;

	if (splice->len)
		moved = do_splice(src, src_off, dst, dst_off, splice->len,
				  splice_flags);

	io_put_file(req, src, (splice->flags & SPLICE_F_FD_IN_FIXED));
	req->flags &= ~REQ_F_NEED_CLEANUP;

	if (moved != splice->len)
		req_set_fail_links(req);

	io_req_complete(req, moved);
	return 0;
}
3692 
3693 /*
3694  * IORING_OP_NOP just posts a completion event, nothing else.
3695  */
io_nop(struct io_kiocb * req,struct io_comp_state * cs)3696 static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
3697 {
3698 	struct io_ring_ctx *ctx = req->ctx;
3699 
3700 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3701 		return -EINVAL;
3702 
3703 	__io_req_complete(req, 0, 0, cs);
3704 	return 0;
3705 }
3706 
io_prep_fsync(struct io_kiocb * req,const struct io_uring_sqe * sqe)3707 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3708 {
3709 	struct io_ring_ctx *ctx = req->ctx;
3710 
3711 	if (!req->file)
3712 		return -EBADF;
3713 
3714 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3715 		return -EINVAL;
3716 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3717 		return -EINVAL;
3718 
3719 	req->sync.flags = READ_ONCE(sqe->fsync_flags);
3720 	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3721 		return -EINVAL;
3722 
3723 	req->sync.off = READ_ONCE(sqe->off);
3724 	req->sync.len = READ_ONCE(sqe->len);
3725 	return 0;
3726 }
3727 
/*
 * Issue handler for fsync. Syncs the range starting at req->sync.off; when
 * off + len is not positive the range extends to LLONG_MAX. Completes the
 * request with the result of vfs_fsync_range() and marks the link failed on
 * error.
 */
static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
	loff_t range_end;
	int ret;

	/* fsync always requires a blocking context */
	if (force_nonblock)
		return -EAGAIN;

	range_end = req->sync.off + req->sync.len;
	if (range_end <= 0)
		range_end = LLONG_MAX;

	ret = vfs_fsync_range(req->file, req->sync.off, range_end,
			      req->sync.flags & IORING_FSYNC_DATASYNC);
	if (ret < 0)
		req_set_fail_links(req);

	io_req_complete(req, ret);
	return 0;
}
3745 
io_fallocate_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3746 static int io_fallocate_prep(struct io_kiocb *req,
3747 			     const struct io_uring_sqe *sqe)
3748 {
3749 	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3750 		return -EINVAL;
3751 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3752 		return -EINVAL;
3753 
3754 	req->sync.off = READ_ONCE(sqe->off);
3755 	req->sync.len = READ_ONCE(sqe->addr);
3756 	req->sync.mode = READ_ONCE(sqe->len);
3757 	return 0;
3758 }
3759 
/*
 * Issue handler for fallocate. Completes the request with the result of
 * vfs_fallocate() and marks the link failed on error.
 */
static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
	int err;

	/* fallocate always requires a blocking context */
	if (force_nonblock)
		return -EAGAIN;

	err = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
			    req->sync.len);
	if (err < 0)
		req_set_fail_links(req);

	io_req_complete(req, err);
	return 0;
}
3774 
__io_openat_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3775 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3776 {
3777 	const char __user *fname;
3778 	int ret;
3779 
3780 	if (unlikely(sqe->ioprio || sqe->buf_index))
3781 		return -EINVAL;
3782 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3783 		return -EBADF;
3784 
3785 	/* open.how should be already initialised */
3786 	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3787 		req->open.how.flags |= O_LARGEFILE;
3788 
3789 	req->open.dfd = READ_ONCE(sqe->fd);
3790 	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3791 	req->open.filename = getname(fname);
3792 	if (IS_ERR(req->open.filename)) {
3793 		ret = PTR_ERR(req->open.filename);
3794 		req->open.filename = NULL;
3795 		return ret;
3796 	}
3797 	req->open.nofile = rlimit(RLIMIT_NOFILE);
3798 	req->open.ignore_nonblock = false;
3799 	req->flags |= REQ_F_NEED_CLEANUP;
3800 	return 0;
3801 }
3802 
io_openat_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3803 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3804 {
3805 	u64 flags, mode;
3806 
3807 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3808 		return -EINVAL;
3809 	mode = READ_ONCE(sqe->len);
3810 	flags = READ_ONCE(sqe->open_flags);
3811 	req->open.how = build_open_how(flags, mode);
3812 	return __io_openat_prep(req, sqe);
3813 }
3814 
io_openat2_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3815 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3816 {
3817 	struct open_how __user *how;
3818 	size_t len;
3819 	int ret;
3820 
3821 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3822 		return -EINVAL;
3823 	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3824 	len = READ_ONCE(sqe->len);
3825 	if (len < OPEN_HOW_SIZE_VER0)
3826 		return -EINVAL;
3827 
3828 	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3829 					len);
3830 	if (ret)
3831 		return ret;
3832 
3833 	return __io_openat_prep(req, sqe);
3834 }
3835 
/*
 * Issue handler for openat/openat2. Reserves an fd, opens the file and
 * installs it, then completes the request with the new fd or a negative
 * error. Returns -EAGAIN in nonblocking mode unless ignore_nonblock was set
 * by an earlier attempt (see below).
 */
static int io_openat2(struct io_kiocb *req, bool force_nonblock)
{
	struct open_flags op;
	struct file *file;
	int ret;

	if (force_nonblock && !req->open.ignore_nonblock)
		return -EAGAIN;

	ret = build_open_flags(&req->open.how, &op);
	if (ret)
		goto err;

	ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
	if (ret < 0)
		goto err;

	file = do_filp_open(req->open.dfd, req->open.filename, &op);
	if (!IS_ERR(file)) {
		fsnotify_open(file);
		fd_install(ret, file);
	} else {
		put_unused_fd(ret);
		ret = PTR_ERR(file);
		/*
		 * Work-around so that /proc/self behaves as expected: an
		 * -EOPNOTSUPP here is assumed to come from
		 * proc_self_get_link() failing because we are in async
		 * context. Retrying from the task itself with
		 * force_nonblock == false should be safe, since it should
		 * not block on lookup. It would be nicer to know this up
		 * front and skip the async dance, but that does not seem
		 * feasible.
		 */
		if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
			req->open.ignore_nonblock = true;
			refcount_inc(&req->refs);
			io_req_task_queue(req);
			return 0;
		}
	}
err:
	putname(req->open.filename);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail_links(req);
	io_req_complete(req, ret);
	return 0;
}
3884 
/*
 * Issue handler for openat. The prep step has already converted the request
 * into an open_how, so this simply shares the openat2 issue path.
 */
static int io_openat(struct io_kiocb *req, bool force_nonblock)
{
	int ret = io_openat2(req, force_nonblock);

	return ret;
}
3889 
io_remove_buffers_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3890 static int io_remove_buffers_prep(struct io_kiocb *req,
3891 				  const struct io_uring_sqe *sqe)
3892 {
3893 	struct io_provide_buf *p = &req->pbuf;
3894 	u64 tmp;
3895 
3896 	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3897 		return -EINVAL;
3898 
3899 	tmp = READ_ONCE(sqe->fd);
3900 	if (!tmp || tmp > USHRT_MAX)
3901 		return -EINVAL;
3902 
3903 	memset(p, 0, sizeof(*p));
3904 	p->nbufs = tmp;
3905 	p->bgid = READ_ONCE(sqe->buf_group);
3906 	return 0;
3907 }
3908 
__io_remove_buffers(struct io_ring_ctx * ctx,struct io_buffer * buf,int bgid,unsigned nbufs)3909 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3910 			       int bgid, unsigned nbufs)
3911 {
3912 	unsigned i = 0;
3913 
3914 	/* shouldn't happen */
3915 	if (!nbufs)
3916 		return 0;
3917 
3918 	/* the head kbuf is the list itself */
3919 	while (!list_empty(&buf->list)) {
3920 		struct io_buffer *nxt;
3921 
3922 		nxt = list_first_entry(&buf->list, struct io_buffer, list);
3923 		list_del(&nxt->list);
3924 		kfree(nxt);
3925 		if (++i == nbufs)
3926 			return i;
3927 	}
3928 	i++;
3929 	kfree(buf);
3930 	idr_remove(&ctx->io_buffer_idr, bgid);
3931 
3932 	return i;
3933 }
3934 
/*
 * Issue handler for remove-buffers: look up the buffer group by id and free
 * up to p->nbufs buffers from it, all under the ring's submit lock.
 *
 * The request is completed with the number of buffers removed, or -ENOENT
 * if the group does not exist. Always returns 0 to the caller.
 *
 * The lock taken at the top must be dropped with io_ring_submit_unlock()
 * before completing the request; calling the lock helper a second time
 * there would take the lock twice and never release it.
 */
static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
			     struct io_comp_state *cs)
{
	struct io_provide_buf *p = &req->pbuf;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer *head;
	int ret;

	/* the helper only takes the lock when told to (!force_nonblock) */
	io_ring_submit_lock(ctx, !force_nonblock);

	lockdep_assert_held(&ctx->uring_lock);

	ret = -ENOENT;
	head = idr_find(&ctx->io_buffer_idr, p->bgid);
	if (head)
		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);

	io_ring_submit_unlock(ctx, !force_nonblock);
	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, 0, cs);
	return 0;
}
3958 
io_provide_buffers_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)3959 static int io_provide_buffers_prep(struct io_kiocb *req,
3960 				   const struct io_uring_sqe *sqe)
3961 {
3962 	struct io_provide_buf *p = &req->pbuf;
3963 	u64 tmp;
3964 
3965 	if (sqe->ioprio || sqe->rw_flags)
3966 		return -EINVAL;
3967 
3968 	tmp = READ_ONCE(sqe->fd);
3969 	if (!tmp || tmp > USHRT_MAX)
3970 		return -E2BIG;
3971 	p->nbufs = tmp;
3972 	p->addr = READ_ONCE(sqe->addr);
3973 	p->len = READ_ONCE(sqe->len);
3974 
3975 	if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3976 		return -EFAULT;
3977 
3978 	p->bgid = READ_ONCE(sqe->buf_group);
3979 	tmp = READ_ONCE(sqe->off);
3980 	if (tmp > USHRT_MAX)
3981 		return -E2BIG;
3982 	p->bid = tmp;
3983 	return 0;
3984 }
3985 
io_add_buffers(struct io_provide_buf * pbuf,struct io_buffer ** head)3986 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3987 {
3988 	struct io_buffer *buf;
3989 	u64 addr = pbuf->addr;
3990 	int i, bid = pbuf->bid;
3991 
3992 	for (i = 0; i < pbuf->nbufs; i++) {
3993 		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3994 		if (!buf)
3995 			break;
3996 
3997 		buf->addr = addr;
3998 		buf->len = pbuf->len;
3999 		buf->bid = bid;
4000 		addr += pbuf->len;
4001 		bid++;
4002 		if (!*head) {
4003 			INIT_LIST_HEAD(&buf->list);
4004 			*head = buf;
4005 		} else {
4006 			list_add_tail(&buf->list, &(*head)->list);
4007 		}
4008 	}
4009 
4010 	return i ? i : -ENOMEM;
4011 }
4012 
/*
 * Issue handler for provide-buffers: add the described buffers to the
 * group p->bgid under the ring's submit lock, registering the group in the
 * idr if it did not exist yet. If that registration fails, the buffers just
 * added are freed again.
 *
 * The request is completed with the value left in ret (negative on error).
 * Always returns 0 to the caller.
 */
static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
			      struct io_comp_state *cs)
{
	struct io_provide_buf *p = &req->pbuf;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer *existing, *head;
	int ret = 0;

	io_ring_submit_lock(ctx, !force_nonblock);

	lockdep_assert_held(&ctx->uring_lock);

	existing = idr_find(&ctx->io_buffer_idr, p->bgid);
	head = existing;

	ret = io_add_buffers(p, &head);
	if (ret >= 0 && !existing) {
		/* new group: publish its head at exactly id == bgid */
		ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
				GFP_KERNEL);
		if (ret < 0)
			__io_remove_buffers(ctx, head, p->bgid, -1U);
	}

	io_ring_submit_unlock(ctx, !force_nonblock);
	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, 0, cs);
	return 0;
}
4046 
io_epoll_ctl_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4047 static int io_epoll_ctl_prep(struct io_kiocb *req,
4048 			     const struct io_uring_sqe *sqe)
4049 {
4050 #if defined(CONFIG_EPOLL)
4051 	if (sqe->ioprio || sqe->buf_index)
4052 		return -EINVAL;
4053 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4054 		return -EINVAL;
4055 
4056 	req->epoll.epfd = READ_ONCE(sqe->fd);
4057 	req->epoll.op = READ_ONCE(sqe->len);
4058 	req->epoll.fd = READ_ONCE(sqe->off);
4059 
4060 	if (ep_op_has_event(req->epoll.op)) {
4061 		struct epoll_event __user *ev;
4062 
4063 		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4064 		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4065 			return -EFAULT;
4066 	}
4067 
4068 	return 0;
4069 #else
4070 	return -EOPNOTSUPP;
4071 #endif
4072 }
4073 
/*
 * Issue handler for epoll_ctl. Returns -EAGAIN so the request is retried
 * when a non-blocking attempt would have blocked; otherwise completes the
 * request with the do_epoll_ctl() result and returns 0.
 */
static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
			struct io_comp_state *cs)
{
#if defined(CONFIG_EPOLL)
	struct io_epoll *ep = &req->epoll;
	int ret = do_epoll_ctl(ep->epfd, ep->op, ep->fd, &ep->event,
			       force_nonblock);

	if (ret == -EAGAIN && force_nonblock)
		return -EAGAIN;

	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, 0, cs);
	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
4093 
io_madvise_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4094 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4095 {
4096 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4097 	if (sqe->ioprio || sqe->buf_index || sqe->off)
4098 		return -EINVAL;
4099 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4100 		return -EINVAL;
4101 
4102 	req->madvise.addr = READ_ONCE(sqe->addr);
4103 	req->madvise.len = READ_ONCE(sqe->len);
4104 	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4105 	return 0;
4106 #else
4107 	return -EOPNOTSUPP;
4108 #endif
4109 }
4110 
/*
 * Issue handler for madvise. Never attempted inline: a non-blocking issue
 * returns -EAGAIN straight away. Otherwise the request is completed with
 * the do_madvise() result and 0 is returned.
 */
static int io_madvise(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
	struct io_madvise *ma;
	int ret;

	if (force_nonblock)
		return -EAGAIN;

	ma = &req->madvise;
	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
	if (ret < 0)
		req_set_fail_links(req);
	io_req_complete(req, ret);
	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
4129 
io_fadvise_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4130 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4131 {
4132 	if (sqe->ioprio || sqe->buf_index || sqe->addr)
4133 		return -EINVAL;
4134 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4135 		return -EINVAL;
4136 
4137 	req->fadvise.offset = READ_ONCE(sqe->off);
4138 	req->fadvise.len = READ_ONCE(sqe->len);
4139 	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4140 	return 0;
4141 }
4142 
/*
 * Issue handler for fadvise. On a non-blocking issue only the NORMAL,
 * RANDOM and SEQUENTIAL hints are executed inline; every other advice
 * value returns -EAGAIN. The request is completed with the vfs_fadvise()
 * result.
 */
static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
{
	struct io_fadvise *fa = &req->fadvise;
	int ret;

	if (force_nonblock &&
	    fa->advice != POSIX_FADV_NORMAL &&
	    fa->advice != POSIX_FADV_RANDOM &&
	    fa->advice != POSIX_FADV_SEQUENTIAL)
		return -EAGAIN;

	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
	if (ret < 0)
		req_set_fail_links(req);
	io_req_complete(req, ret);
	return 0;
}
4165 
io_statx_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4166 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4167 {
4168 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4169 		return -EINVAL;
4170 	if (sqe->ioprio || sqe->buf_index)
4171 		return -EINVAL;
4172 	if (req->flags & REQ_F_FIXED_FILE)
4173 		return -EBADF;
4174 
4175 	req->statx.dfd = READ_ONCE(sqe->fd);
4176 	req->statx.mask = READ_ONCE(sqe->len);
4177 	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4178 	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4179 	req->statx.flags = READ_ONCE(sqe->statx_flags);
4180 
4181 	return 0;
4182 }
4183 
/*
 * Issue handler for statx. A non-blocking issue always returns -EAGAIN;
 * before doing so it flags the request as not needing the file table when
 * the dfd is -1 or AT_FDCWD. Otherwise the request is completed with the
 * do_statx() result.
 */
static int io_statx(struct io_kiocb *req, bool force_nonblock)
{
	struct io_statx *sx = &req->statx;
	int ret;

	if (!force_nonblock) {
		ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask,
			       sx->buffer);

		if (ret < 0)
			req_set_fail_links(req);
		io_req_complete(req, ret);
		return 0;
	}

	/* only need file table for an actual valid fd */
	if (sx->dfd == -1 || sx->dfd == AT_FDCWD)
		req->flags |= REQ_F_NO_FILE_TABLE;
	return -EAGAIN;
}
4204 
io_close_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4205 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4206 {
4207 	/*
4208 	 * If we queue this for async, it must not be cancellable. That would
4209 	 * leave the 'file' in an undeterminate state, and here need to modify
4210 	 * io_wq_work.flags, so initialize io_wq_work firstly.
4211 	 */
4212 	io_req_init_async(req);
4213 	req->work.flags |= IO_WQ_WORK_NO_CANCEL;
4214 
4215 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4216 		return -EINVAL;
4217 	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4218 	    sqe->rw_flags || sqe->buf_index)
4219 		return -EINVAL;
4220 	if (req->flags & REQ_F_FIXED_FILE)
4221 		return -EBADF;
4222 
4223 	req->close.fd = READ_ONCE(sqe->fd);
4224 	if ((req->file && req->file->f_op == &io_uring_fops))
4225 		return -EBADF;
4226 
4227 	req->close.put_file = NULL;
4228 	return 0;
4229 }
4230 
/*
 * Issue handler for close.
 *
 * The fd is detached via __close_fd_get_file() at most once; the resulting
 * file is remembered in close->put_file so a later retry skips that step.
 * If the file has a ->flush() method and this is a non-blocking issue, the
 * request returns -EAGAIN to be retried; otherwise the file is closed here
 * and the request completed with the filp_close() result.
 *
 * Returns 0 once the request has been completed, -EAGAIN for a retry, or a
 * negative errno from detaching the fd (-ENOENT is reported as -EBADF).
 */
static int io_close(struct io_kiocb *req, bool force_nonblock,
		    struct io_comp_state *cs)
{
	struct io_close *close = &req->close;
	int ret;

	/* might be already done during nonblock submission */
	if (!close->put_file) {
		ret = __close_fd_get_file(close->fd, &close->put_file);
		if (ret < 0)
			return (ret == -ENOENT) ? -EBADF : ret;
	}

	/* if the file has a flush method, be safe and punt to async */
	if (close->put_file->f_op->flush && force_nonblock) {
		/* was never set, but play safe */
		req->flags &= ~REQ_F_NOWAIT;
		/* avoid grabbing files - we don't need the files */
		req->flags |= REQ_F_NO_FILE_TABLE;
		return -EAGAIN;
	}

	/* No ->flush() or already async, safely close from here */
	ret = filp_close(close->put_file, req->work.identity->files);
	if (ret < 0)
		req_set_fail_links(req);
	/* drop the reference on the detached file and forget it */
	fput(close->put_file);
	close->put_file = NULL;
	__io_req_complete(req, ret, 0, cs);
	return 0;
}
4262 
io_prep_sfr(struct io_kiocb * req,const struct io_uring_sqe * sqe)4263 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4264 {
4265 	struct io_ring_ctx *ctx = req->ctx;
4266 
4267 	if (!req->file)
4268 		return -EBADF;
4269 
4270 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4271 		return -EINVAL;
4272 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4273 		return -EINVAL;
4274 
4275 	req->sync.off = READ_ONCE(sqe->off);
4276 	req->sync.len = READ_ONCE(sqe->len);
4277 	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4278 	return 0;
4279 }
4280 
/*
 * Issue handler for sync_file_range. Returns -EAGAIN on a non-blocking
 * issue; otherwise completes the request with the sync_file_range() result
 * and returns 0.
 */
static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
	int ret;

	/* sync_file_range always requires a blocking context */
	if (!force_nonblock) {
		ret = sync_file_range(req->file, req->sync.off, req->sync.len,
					req->sync.flags);
		if (ret < 0)
			req_set_fail_links(req);
		io_req_complete(req, ret);
		return 0;
	}

	return -EAGAIN;
}
4296 
4297 #if defined(CONFIG_NET)
/*
 * Preserve the message header state in req->async_data so the request can
 * be retried later.
 *
 * If async data already exists nothing is copied. Otherwise async data is
 * allocated and *kmsg is copied into it wholesale, and the request is
 * marked REQ_F_NEED_CLEANUP.
 *
 * Returns -EAGAIN in both of those cases (the caller passes this on as the
 * "retry" result), or -ENOMEM if the allocation fails; in the latter case
 * a separately allocated kmsg->iov is freed here.
 */
static int io_setup_async_msg(struct io_kiocb *req,
			      struct io_async_msghdr *kmsg)
{
	struct io_async_msghdr *async_msg = req->async_data;

	if (async_msg)
		return -EAGAIN;
	if (io_alloc_async_data(req)) {
		/* iov is only owned by us if it is not the embedded array */
		if (kmsg->iov != kmsg->fast_iov)
			kfree(kmsg->iov);
		return -ENOMEM;
	}
	async_msg = req->async_data;
	req->flags |= REQ_F_NEED_CLEANUP;
	memcpy(async_msg, kmsg, sizeof(*kmsg));
	return -EAGAIN;
}
4315 
/*
 * Import the user msghdr for a sendmsg request into @iomsg.
 *
 * Points iomsg->iov at the embedded fast_iov array and msg_name at the
 * embedded address storage, then lets sendmsg_copy_msghdr() do the copy.
 * Returns that helper's result.
 */
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	iomsg->iov = iomsg->fast_iov;
	iomsg->msg.msg_name = &iomsg->addr;
	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
				   req->sr_msg.msg_flags, &iomsg->iov);
}
4324 
io_sendmsg_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4325 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4326 {
4327 	struct io_async_msghdr *async_msg = req->async_data;
4328 	struct io_sr_msg *sr = &req->sr_msg;
4329 	int ret;
4330 
4331 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4332 		return -EINVAL;
4333 
4334 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
4335 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4336 	sr->len = READ_ONCE(sqe->len);
4337 
4338 #ifdef CONFIG_COMPAT
4339 	if (req->ctx->compat)
4340 		sr->msg_flags |= MSG_CMSG_COMPAT;
4341 #endif
4342 
4343 	if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
4344 		return 0;
4345 	ret = io_sendmsg_copy_hdr(req, async_msg);
4346 	if (!ret)
4347 		req->flags |= REQ_F_NEED_CLEANUP;
4348 	return ret;
4349 }
4350 
/*
 * Issue handler for sendmsg.
 *
 * Uses the message header saved in req->async_data when present, otherwise
 * imports it from user space into an on-stack copy. If a non-blocking
 * attempt returns -EAGAIN, the header is saved via io_setup_async_msg()
 * and that function's result is returned. In all other cases a separately
 * allocated iovec is freed and the request is completed with the send
 * result (-ERESTARTSYS is reported as -EINTR).
 *
 * Returns 0 once completed, or a negative errno when the request was not
 * completed here (not a socket, header import failure, retry needed).
 */
static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
		      struct io_comp_state *cs)
{
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int ret;

	sock = sock_from_file(req->file, &ret);
	if (unlikely(!sock))
		return ret;

	if (req->async_data) {
		kmsg = req->async_data;
		/* re-point embedded pointers at this copy of the header */
		kmsg->msg.msg_name = &kmsg->addr;
		/* if iov is set, it's allocated already */
		if (!kmsg->iov)
			kmsg->iov = kmsg->fast_iov;
		kmsg->msg.msg_iter.iov = kmsg->iov;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	flags = req->sr_msg.msg_flags;
	if (flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	else if (force_nonblock)
		flags |= MSG_DONTWAIT;

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
	if (force_nonblock && ret == -EAGAIN)
		return io_setup_async_msg(req, kmsg);
	if (ret == -ERESTARTSYS)
		ret = -EINTR;

	/* only free an iovec that is not the embedded fast_iov array */
	if (kmsg->iov != kmsg->fast_iov)
		kfree(kmsg->iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, 0, cs);
	return 0;
}
4397 
/*
 * Issue handler for send: transmit sr->len bytes from the user buffer
 * sr->buf on the request's socket.
 *
 * Returns -EAGAIN when a non-blocking attempt would block, a negative
 * errno if the file is not a socket or the buffer cannot be imported, and
 * 0 once the request has been completed with the send result
 * (-ERESTARTSYS is reported as -EINTR).
 */
static int io_send(struct io_kiocb *req, bool force_nonblock,
		   struct io_comp_state *cs)
{
	struct io_sr_msg *sr = &req->sr_msg;
	struct msghdr msg;
	struct iovec iov;
	struct socket *sock;
	unsigned flags;
	int ret;

	sock = sock_from_file(req->file, &ret);
	if (unlikely(!sock))
		return ret;

	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
	if (unlikely(ret))
		return ret;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	/*
	 * msg lives on the stack; clear msg_iocb as io_recv() does so that no
	 * callee can ever observe stack garbage in it.
	 */
	msg.msg_iocb = NULL;

	flags = req->sr_msg.msg_flags;
	if (flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	else if (force_nonblock)
		flags |= MSG_DONTWAIT;

	msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &msg);
	if (force_nonblock && ret == -EAGAIN)
		return -EAGAIN;
	if (ret == -ERESTARTSYS)
		ret = -EINTR;

	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, 0, cs);
	return 0;
}
4439 
/*
 * Import the native (non-compat) user msghdr for a recvmsg request.
 *
 * With REQ_F_BUFFER_SELECT at most one iovec is accepted; it is copied
 * into the array iomsg->iov points at, its length becomes sr->len, and
 * iomsg->iov is then set to NULL. Without buffer selection the iovec array
 * is imported through __import_iovec().
 *
 * Returns 0 on success or a negative errno.
 */
static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = &req->sr_msg;
	struct iovec __user *uiov;
	size_t iov_len;
	int ret;

	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
					&iomsg->uaddr, &uiov, &iov_len);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (iov_len > 1)
			return -EINVAL;
		if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
			return -EFAULT;
		sr->len = iomsg->iov[0].iov_len;
		iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
				sr->len);
		/* nothing was allocated, so leave no iovec for callers to free */
		iomsg->iov = NULL;
	} else {
		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
				     &iomsg->iov, &iomsg->msg.msg_iter,
				     false);
		/* a positive return is not an error; normalise it to 0 */
		if (ret > 0)
			ret = 0;
	}

	return ret;
}
4472 
4473 #ifdef CONFIG_COMPAT
/*
 * Compat counterpart of __io_recvmsg_copy_hdr(): import a compat user
 * msghdr for a recvmsg request.
 *
 * With REQ_F_BUFFER_SELECT at most one iovec is accepted; only its length
 * is read from user space (negative lengths are rejected), stored in
 * sr->len and in the first iovec slot, and iomsg->iov is then set to NULL.
 * Without buffer selection the iovec array is imported through
 * __import_iovec() in compat mode.
 *
 * Returns 0 on success or a negative errno.
 */
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
					struct io_async_msghdr *iomsg)
{
	struct compat_msghdr __user *msg_compat;
	struct io_sr_msg *sr = &req->sr_msg;
	struct compat_iovec __user *uiov;
	compat_uptr_t ptr;
	compat_size_t len;
	int ret;

	msg_compat = (struct compat_msghdr __user *) sr->umsg;
	ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
					&ptr, &len);
	if (ret)
		return ret;

	uiov = compat_ptr(ptr);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		compat_ssize_t clen;

		if (len > 1)
			return -EINVAL;
		/* __get_user() below does no range check of its own */
		if (!access_ok(uiov, sizeof(*uiov)))
			return -EFAULT;
		if (__get_user(clen, &uiov->iov_len))
			return -EFAULT;
		if (clen < 0)
			return -EINVAL;
		sr->len = clen;
		iomsg->iov[0].iov_len = clen;
		/* nothing was allocated, so leave no iovec for callers to free */
		iomsg->iov = NULL;
	} else {
		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
				   UIO_FASTIOV, &iomsg->iov,
				   &iomsg->msg.msg_iter, true);
		if (ret < 0)
			return ret;
	}

	return 0;
}
4515 #endif
4516 
/*
 * Import the user msghdr for a recvmsg request into @iomsg.
 *
 * Points msg_name at the embedded address storage and iomsg->iov at the
 * embedded fast_iov array, then dispatches to the compat or native import
 * helper depending on the ring. Returns the helper's result.
 */
static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->iov = iomsg->fast_iov;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return __io_compat_recvmsg_copy_hdr(req, iomsg);
#endif

	return __io_recvmsg_copy_hdr(req, iomsg);
}
4530 
/*
 * Pick a provided buffer from group sr->bgid for a receive request.
 *
 * On success the buffer is remembered in sr->kbuf and the request is
 * marked REQ_F_BUFFER_SELECTED. Returns the buffer, or the ERR_PTR from
 * io_buffer_select() unchanged.
 */
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
					       bool needs_lock)
{
	struct io_sr_msg *sr = &req->sr_msg;
	struct io_buffer *picked;

	picked = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
	if (!IS_ERR(picked)) {
		sr->kbuf = picked;
		req->flags |= REQ_F_BUFFER_SELECTED;
	}

	return picked;
}
4545 
/*
 * Release the buffer selected for a receive request; returns whatever
 * io_put_kbuf() returns (callers use it as the completion flags).
 */
static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
	return io_put_kbuf(req, req->sr_msg.kbuf);
}
4550 
io_recvmsg_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4551 static int io_recvmsg_prep(struct io_kiocb *req,
4552 			   const struct io_uring_sqe *sqe)
4553 {
4554 	struct io_async_msghdr *async_msg = req->async_data;
4555 	struct io_sr_msg *sr = &req->sr_msg;
4556 	int ret;
4557 
4558 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4559 		return -EINVAL;
4560 
4561 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
4562 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4563 	sr->len = READ_ONCE(sqe->len);
4564 	sr->bgid = READ_ONCE(sqe->buf_group);
4565 
4566 #ifdef CONFIG_COMPAT
4567 	if (req->ctx->compat)
4568 		sr->msg_flags |= MSG_CMSG_COMPAT;
4569 #endif
4570 
4571 	if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
4572 		return 0;
4573 	ret = io_recvmsg_copy_hdr(req, async_msg);
4574 	if (!ret)
4575 		req->flags |= REQ_F_NEED_CLEANUP;
4576 	return ret;
4577 }
4578 
/*
 * Issue handler for recvmsg.
 *
 * Uses the message header saved in req->async_data when present, otherwise
 * imports it from user space into an on-stack copy. With
 * REQ_F_BUFFER_SELECT a provided buffer is picked and the receive is
 * directed into it. If a non-blocking attempt returns -EAGAIN, the header
 * is saved via io_setup_async_msg() and that function's result is
 * returned. Otherwise the selected buffer (if any) is released, a
 * separately allocated iovec is freed and the request is completed with
 * the receive result (-ERESTARTSYS is reported as -EINTR).
 *
 * Returns 0 once completed, or a negative errno when the request was not
 * completed here (not a socket, header import failure, no buffer
 * available, retry needed).
 */
static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
		      struct io_comp_state *cs)
{
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	struct io_buffer *kbuf;
	unsigned flags;
	int ret, cflags = 0;

	sock = sock_from_file(req->file, &ret);
	if (unlikely(!sock))
		return ret;

	if (req->async_data) {
		kmsg = req->async_data;
		/* re-point embedded pointers at this copy of the header */
		kmsg->msg.msg_name = &kmsg->addr;
		/* if iov is set, it's allocated already */
		if (!kmsg->iov)
			kmsg->iov = kmsg->fast_iov;
		kmsg->msg.msg_iter.iov = kmsg->iov;
	} else {
		ret = io_recvmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (req->flags & REQ_F_BUFFER_SELECT) {
		kbuf = io_recv_buffer_select(req, !force_nonblock);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
		/*
		 * Buffer select always receives into fast_iov[0], so build the
		 * iterator over fast_iov directly. On the inline path the
		 * header import leaves kmsg->iov set to NULL for buffer
		 * select, and that NULL must never be handed to
		 * iov_iter_init(). The length is refreshed too, as the buffer
		 * selection may have adjusted sr_msg.len.
		 */
		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
				1, req->sr_msg.len);
	}

	flags = req->sr_msg.msg_flags;
	if (flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	else if (force_nonblock)
		flags |= MSG_DONTWAIT;

	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
					kmsg->uaddr, flags);
	if (force_nonblock && ret == -EAGAIN)
		return io_setup_async_msg(req, kmsg);
	if (ret == -ERESTARTSYS)
		ret = -EINTR;

	if (req->flags & REQ_F_BUFFER_SELECTED)
		cflags = io_put_recv_kbuf(req);
	/* only free an iovec that is not the embedded fast_iov array */
	if (kmsg->iov != kmsg->fast_iov)
		kfree(kmsg->iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, cflags, cs);
	return 0;
}
4638 
/*
 * Issue handler for recv: receive up to sr->len bytes into either the user
 * buffer sr->buf or, with REQ_F_BUFFER_SELECT, a provided buffer picked
 * from the request's buffer group.
 *
 * Returns -EAGAIN when a non-blocking attempt would block, a negative
 * errno if the file is not a socket or no buffer could be selected, and 0
 * once the request has been completed. A failure to import the buffer is
 * reported through the completion, after releasing any selected buffer.
 */
static int io_recv(struct io_kiocb *req, bool force_nonblock,
		   struct io_comp_state *cs)
{
	struct io_buffer *kbuf;
	struct io_sr_msg *sr = &req->sr_msg;
	struct msghdr msg;
	void __user *buf = sr->buf;
	struct socket *sock;
	struct iovec iov;
	unsigned flags;
	int ret, cflags = 0;

	sock = sock_from_file(req->file, &ret);
	if (unlikely(!sock))
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		kbuf = io_recv_buffer_select(req, !force_nonblock);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
		/* receive into the selected buffer instead of sr->buf */
		buf = u64_to_user_ptr(kbuf->addr);
	}

	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
	if (unlikely(ret))
		goto out_free;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	msg.msg_iocb = NULL;
	msg.msg_flags = 0;

	flags = req->sr_msg.msg_flags;
	if (flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	else if (force_nonblock)
		flags |= MSG_DONTWAIT;

	ret = sock_recvmsg(sock, &msg, flags);
	if (force_nonblock && ret == -EAGAIN)
		return -EAGAIN;
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out_free:
	/* hand the selected buffer back and pick up the completion flags */
	if (req->flags & REQ_F_BUFFER_SELECTED)
		cflags = io_put_recv_kbuf(req);
	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, cflags, cs);
	return 0;
}
4692 
io_accept_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4693 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4694 {
4695 	struct io_accept *accept = &req->accept;
4696 
4697 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4698 		return -EINVAL;
4699 	if (sqe->ioprio || sqe->len || sqe->buf_index)
4700 		return -EINVAL;
4701 
4702 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4703 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4704 	accept->flags = READ_ONCE(sqe->accept_flags);
4705 	accept->nofile = rlimit(RLIMIT_NOFILE);
4706 	return 0;
4707 }
4708 
/*
 * Issue handler for accept. Returns -EAGAIN when a non-blocking attempt
 * would block; otherwise completes the request with the accept result
 * (-ERESTARTSYS is reported as -EINTR) and returns 0.
 */
static int io_accept(struct io_kiocb *req, bool force_nonblock,
		     struct io_comp_state *cs)
{
	struct io_accept *acc = &req->accept;
	unsigned int file_flags = 0;
	int ret;

	if (force_nonblock)
		file_flags = O_NONBLOCK;

	/* a non-blocking socket never waits, so mark the request NOWAIT */
	if (req->file->f_flags & O_NONBLOCK)
		req->flags |= REQ_F_NOWAIT;

	ret = __sys_accept4_file(req->file, file_flags, acc->addr,
					acc->addr_len, acc->flags,
					acc->nofile);
	if (force_nonblock && ret == -EAGAIN)
		return -EAGAIN;

	if (ret == -ERESTARTSYS)
		ret = -EINTR;
	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, 0, cs);
	return 0;
}
4732 
io_connect_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)4733 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4734 {
4735 	struct io_connect *conn = &req->connect;
4736 	struct io_async_connect *io = req->async_data;
4737 
4738 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4739 		return -EINVAL;
4740 	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4741 		return -EINVAL;
4742 
4743 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4744 	conn->addr_len =  READ_ONCE(sqe->addr2);
4745 
4746 	if (!io)
4747 		return 0;
4748 
4749 	return move_addr_to_kernel(conn->addr, conn->addr_len,
4750 					&io->address);
4751 }
4752 
/*
 * Issue handler for connect.
 *
 * Uses the address saved in req->async_data when present, otherwise copies
 * it from user space into an on-stack copy. If a non-blocking attempt
 * returns -EAGAIN or -EINPROGRESS, the on-stack address is preserved in
 * newly allocated async data (unless async data already exists) and
 * -EAGAIN is returned for a retry. All other outcomes, including a failed
 * address copy or async allocation, complete the request and return 0
 * (-ERESTARTSYS is reported as -EINTR).
 */
static int io_connect(struct io_kiocb *req, bool force_nonblock,
		      struct io_comp_state *cs)
{
	struct io_async_connect __io, *io;
	unsigned file_flags;
	int ret;

	if (req->async_data) {
		io = req->async_data;
	} else {
		ret = move_addr_to_kernel(req->connect.addr,
						req->connect.addr_len,
						&__io.address);
		if (ret)
			goto out;
		io = &__io;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->address,
					req->connect.addr_len, file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
		if (req->async_data)
			return -EAGAIN;
		if (io_alloc_async_data(req)) {
			ret = -ENOMEM;
			goto out;
		}
		/* NOTE(review): 'io' is not read again after this assignment */
		io = req->async_data;
		/* keep the kernel copy of the address for the retry */
		memcpy(req->async_data, &__io, sizeof(__io));
		return -EAGAIN;
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail_links(req);
	__io_req_complete(req, ret, 0, cs);
	return 0;
}
4794 #else /* !CONFIG_NET */
/* !CONFIG_NET stub: sendmsg/send preparation is not supported. */
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return -EOPNOTSUPP;
}
4799 
/* !CONFIG_NET stub: sendmsg is not supported. */
static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
		      struct io_comp_state *cs)
{
	return -EOPNOTSUPP;
}
4805 
/* !CONFIG_NET stub: send is not supported. */
static int io_send(struct io_kiocb *req, bool force_nonblock,
		   struct io_comp_state *cs)
{
	return -EOPNOTSUPP;
}
4811 
/* !CONFIG_NET stub: recvmsg/recv preparation is not supported. */
static int io_recvmsg_prep(struct io_kiocb *req,
			   const struct io_uring_sqe *sqe)
{
	return -EOPNOTSUPP;
}
4817 
/* !CONFIG_NET stub: recvmsg is not supported. */
static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
		      struct io_comp_state *cs)
{
	return -EOPNOTSUPP;
}
4823 
/* !CONFIG_NET stub: recv is not supported. */
static int io_recv(struct io_kiocb *req, bool force_nonblock,
		   struct io_comp_state *cs)
{
	return -EOPNOTSUPP;
}
4829 
/* !CONFIG_NET stub: accept preparation is not supported. */
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return -EOPNOTSUPP;
}
4834 
/* !CONFIG_NET stub: accept is not supported. */
static int io_accept(struct io_kiocb *req, bool force_nonblock,
		     struct io_comp_state *cs)
{
	return -EOPNOTSUPP;
}
4840 
/* !CONFIG_NET stub: connect preparation is not supported. */
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return -EOPNOTSUPP;
}
4845 
/* !CONFIG_NET stub: connect is not supported. */
static int io_connect(struct io_kiocb *req, bool force_nonblock,
		      struct io_comp_state *cs)
{
	return -EOPNOTSUPP;
}
4851 #endif /* CONFIG_NET */
4852 
/*
 * poll_table wrapper handed to the poll queueing callback: carries the
 * request being armed and an error slot the callback reports through.
 */
struct io_poll_table {
	struct poll_table_struct pt;
	/* request the wait queue entries are being set up for */
	struct io_kiocb *req;
	/* 0 or a negative errno set while queueing */
	int error;
};
4858 
/*
 * Common wakeup path for a poll entry: detach it from its wait queue and
 * schedule @func as task work for the request.
 *
 * Returns 0 without doing anything if @mask is non-zero and matches none
 * of the events the poll entry waits for; returns 1 once the task work
 * has been queued. A ctx reference is taken for the queued work.
 */
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
			   __poll_t mask, task_work_func_t func)
{
	bool twa_signal_ok;
	int ret;

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & poll->events))
		return 0;

	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);

	list_del_init(&poll->wait.entry);

	req->result = mask;
	init_task_work(&req->task_work, func);
	percpu_ref_get(&req->ctx->refs);

	/*
	 * If we are using the signalfd wait_queue_head for this wakeup, then
	 * it's not safe to use TWA_SIGNAL as we could be recursing on the
	 * tsk->sighand->siglock on doing the wakeup. Should not be needed
	 * either, as the normal wakeup will suffice.
	 */
	twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);

	/*
	 * If this fails, then the task is exiting. When a task exits, the
	 * work gets canceled, so just cancel this request as well instead
	 * of executing it. We can't safely execute it anyway, as we may not
	 * have the state needed for it.
	 */
	ret = io_req_task_work_add(req, twa_signal_ok);
	if (unlikely(ret)) {
		struct task_struct *tsk;

		WRITE_ONCE(poll->canceled, true);
		/* fall back to running the work from the io-wq task */
		tsk = io_wq_get_task(req->ctx->io_wq);
		task_work_add(tsk, &req->task_work, TWA_NONE);
		wake_up_process(tsk);
	}
	return 1;
}
4902 
/*
 * Decide whether a woken poll request has to go back to waiting.
 *
 * If no result has been recorded and the poll was not canceled, the file
 * is polled again first. Then, with ctx->completion_lock taken, the poll
 * entry is re-added to its wait queue if there is still no result.
 *
 * Returns true if the entry was re-armed, false if the request should be
 * completed. In BOTH cases the function returns with
 * ctx->completion_lock held; the caller must release it.
 */
static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
	__acquires(&req->ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!req->result && !READ_ONCE(poll->canceled)) {
		struct poll_table_struct pt = { ._key = poll->events };

		req->result = vfs_poll(req->file, &pt) & poll->events;
	}

	spin_lock_irq(&ctx->completion_lock);
	/* re-check under the lock before re-arming */
	if (!req->result && !READ_ONCE(poll->canceled)) {
		add_wait_queue(poll->head, &poll->wait);
		return true;
	}

	return false;
}
4922 
io_poll_get_double(struct io_kiocb * req)4923 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4924 {
4925 	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4926 	if (req->opcode == IORING_OP_POLL_ADD)
4927 		return req->async_data;
4928 	return req->apoll->double_poll;
4929 }
4930 
io_poll_get_single(struct io_kiocb * req)4931 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4932 {
4933 	if (req->opcode == IORING_OP_POLL_ADD)
4934 		return &req->poll;
4935 	return &req->apoll->poll;
4936 }
4937 
io_poll_remove_double(struct io_kiocb * req)4938 static void io_poll_remove_double(struct io_kiocb *req)
4939 {
4940 	struct io_poll_iocb *poll = io_poll_get_double(req);
4941 
4942 	lockdep_assert_held(&req->ctx->completion_lock);
4943 
4944 	if (poll && poll->head) {
4945 		struct wait_queue_head *head = poll->head;
4946 
4947 		spin_lock(&head->lock);
4948 		list_del_init(&poll->wait.entry);
4949 		if (poll->wait.private)
4950 			refcount_dec(&req->refs);
4951 		poll->head = NULL;
4952 		spin_unlock(&head->lock);
4953 	}
4954 }
4955 
io_poll_complete(struct io_kiocb * req,__poll_t mask,int error)4956 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4957 {
4958 	struct io_ring_ctx *ctx = req->ctx;
4959 
4960 	io_poll_remove_double(req);
4961 	req->poll.done = true;
4962 	io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4963 	io_commit_cqring(ctx);
4964 }
4965 
/*
 * Task-work callback for a poll request.
 *
 * If the poll has to keep waiting it is re-armed and nothing else happens.
 * Otherwise the request is unhashed and completed, its reference dropped,
 * and any request linked after it is submitted. The ctx reference held
 * for this callback is dropped on the way out either way.
 */
static void io_poll_task_func(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *nxt;

	/* io_poll_rewait() returns with completion_lock held in both cases */
	if (io_poll_rewait(req, &req->poll)) {
		spin_unlock_irq(&ctx->completion_lock);
	} else {
		hash_del(&req->hash_node);
		io_poll_complete(req, req->result, 0);
		spin_unlock_irq(&ctx->completion_lock);

		nxt = io_put_req_find_next(req);
		io_cqring_ev_posted(ctx);
		if (nxt)
			__io_req_task_submit(nxt);
	}

	percpu_ref_put(&ctx->refs);
}
4987 
/*
 * Wait queue callback for the secondary poll entry of a request.
 *
 * Ignores wakeups whose mask matches none of the awaited events
 * (returns 0). Otherwise removes this entry from its wait queue and, if
 * the primary entry is still queued, removes that one too and invokes the
 * primary entry's own wake function. The request reference held by the
 * secondary entry is dropped before returning 1.
 */
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
			       int sync, void *key)
{
	struct io_kiocb *req = wait->private;
	struct io_poll_iocb *poll = io_poll_get_single(req);
	__poll_t mask = key_to_poll(key);

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & poll->events))
		return 0;

	list_del_init(&wait->entry);

	if (poll && poll->head) {
		bool done;

		spin_lock(&poll->head->lock);
		/* an empty entry means the primary wakeup already ran */
		done = list_empty(&poll->wait.entry);
		if (!done)
			list_del_init(&poll->wait.entry);
		/* make sure double remove sees this as being gone */
		wait->private = NULL;
		spin_unlock(&poll->head->lock);
		if (!done) {
			/* use wait func handler, so it matches the rq type */
			poll->wait.func(&poll->wait, mode, sync, key);
		}
	}
	refcount_dec(&req->refs);
	return 1;
}
5019 
/*
 * Put @poll into its idle state: not attached to any waitqueue head, neither
 * done nor canceled, interested in @events, and owning an unlinked wait
 * entry whose wake callback is @wake_func.
 */
static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
			      wait_queue_func_t wake_func)
{
	struct wait_queue_entry *wqe = &poll->wait;

	init_waitqueue_func_entry(wqe, wake_func);
	INIT_LIST_HEAD(&wqe->entry);

	poll->events = events;
	poll->canceled = false;
	poll->done = false;
	poll->head = NULL;
}
5030 
__io_queue_proc(struct io_poll_iocb * poll,struct io_poll_table * pt,struct wait_queue_head * head,struct io_poll_iocb ** poll_ptr)5031 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
5032 			    struct wait_queue_head *head,
5033 			    struct io_poll_iocb **poll_ptr)
5034 {
5035 	struct io_kiocb *req = pt->req;
5036 
5037 	/*
5038 	 * If poll->head is already set, it's because the file being polled
5039 	 * uses multiple waitqueues for poll handling (eg one for read, one
5040 	 * for write). Setup a separate io_poll_iocb if this happens.
5041 	 */
5042 	if (unlikely(poll->head)) {
5043 		struct io_poll_iocb *poll_one = poll;
5044 
5045 		/* already have a 2nd entry, fail a third attempt */
5046 		if (*poll_ptr) {
5047 			pt->error = -EINVAL;
5048 			return;
5049 		}
5050 		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5051 		if (!poll) {
5052 			pt->error = -ENOMEM;
5053 			return;
5054 		}
5055 		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
5056 		refcount_inc(&req->refs);
5057 		poll->wait.private = req;
5058 		*poll_ptr = poll;
5059 	}
5060 
5061 	pt->error = 0;
5062 	poll->head = head;
5063 
5064 	if (poll->events & EPOLLEXCLUSIVE)
5065 		add_wait_queue_exclusive(head, &poll->wait);
5066 	else
5067 		add_wait_queue(head, &poll->wait);
5068 }
5069 
io_async_queue_proc(struct file * file,struct wait_queue_head * head,struct poll_table_struct * p)5070 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5071 			       struct poll_table_struct *p)
5072 {
5073 	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5074 	struct async_poll *apoll = pt->req->apoll;
5075 
5076 	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5077 }
5078 
/*
 * task_work callback for an internally armed (async) poll.
 *
 * If io_poll_rewait() returns true, only the completion lock and the ctx
 * reference are released. Otherwise the request is removed from the cancel
 * hash if still hashed, any second wait entry is removed, and the request is
 * either submitted or, if the poll was canceled, failed with -ECANCELED.
 * The async_poll data is freed at the end.
 */
static void io_async_task_func(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
	/* read up front: apoll is still used after req has been handed off */
	struct async_poll *apoll = req->apoll;
	struct io_ring_ctx *ctx = req->ctx;

	trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);

	if (io_poll_rewait(req, &apoll->poll)) {
		spin_unlock_irq(&ctx->completion_lock);
		percpu_ref_put(&ctx->refs);
		return;
	}

	/* If req is still hashed, it cannot have been canceled. Don't check. */
	if (hash_hashed(&req->hash_node))
		hash_del(&req->hash_node);

	io_poll_remove_double(req);
	spin_unlock_irq(&ctx->completion_lock);

	if (!READ_ONCE(apoll->poll.canceled))
		__io_req_task_submit(req);
	else
		__io_req_task_cancel(req, -ECANCELED);

	/* only ctx and apoll (the locals captured above) are used from here */
	percpu_ref_put(&ctx->refs);
	kfree(apoll->double_poll);
	kfree(apoll);
}
5109 
io_async_wake(struct wait_queue_entry * wait,unsigned mode,int sync,void * key)5110 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5111 			void *key)
5112 {
5113 	struct io_kiocb *req = wait->private;
5114 	struct io_poll_iocb *poll = &req->apoll->poll;
5115 
5116 	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5117 					key_to_poll(key));
5118 
5119 	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5120 }
5121 
io_poll_req_insert(struct io_kiocb * req)5122 static void io_poll_req_insert(struct io_kiocb *req)
5123 {
5124 	struct io_ring_ctx *ctx = req->ctx;
5125 	struct hlist_head *list;
5126 
5127 	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5128 	hlist_add_head(&req->hash_node, list);
5129 }
5130 
/*
 * Arm @poll for @req on req->file and report events that are already
 * pending.
 *
 * @ipt must have pt._qproc set by the caller; the remaining fields are
 * initialised here. @mask is the set of events to wait for and @wake_func
 * the waitqueue wake callback.
 *
 * Returns the subset of poll->events that vfs_poll() reported as ready, or 0
 * if that result was discarded (see below). ipt->error carries the queueing
 * status: it starts as -EINVAL and is cleared by __io_queue_proc() once a
 * wait entry has been queued.
 *
 * Always returns with ctx->completion_lock held (irqs disabled); the caller
 * is responsible for dropping it.
 */
static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
				      struct io_poll_iocb *poll,
				      struct io_poll_table *ipt, __poll_t mask,
				      wait_queue_func_t wake_func)
	__acquires(&ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool cancel = false;

	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask, wake_func);
	poll->file = req->file;
	poll->wait.private = req;

	ipt->pt._key = mask;
	ipt->req = req;
	/* stays -EINVAL unless the queue proc manages to queue an entry */
	ipt->error = -EINVAL;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	spin_lock_irq(&ctx->completion_lock);
	if (likely(poll->head)) {
		spin_lock(&poll->head->lock);
		/*
		 * The wait entry is already off the waitqueue - presumably a
		 * wakeup got in between vfs_poll() and taking the lock. Drop
		 * our own result; a pending error becomes a cancelation.
		 */
		if (unlikely(list_empty(&poll->wait.entry))) {
			if (ipt->error)
				cancel = true;
			ipt->error = 0;
			mask = 0;
		}
		if (mask || ipt->error)
			list_del_init(&poll->wait.entry);
		else if (cancel)
			WRITE_ONCE(poll->canceled, true);
		else if (!poll->done) /* actually waiting for an event */
			io_poll_req_insert(req);
		spin_unlock(&poll->head->lock);
	}

	return mask;
}
5171 
io_arm_poll_handler(struct io_kiocb * req)5172 static bool io_arm_poll_handler(struct io_kiocb *req)
5173 {
5174 	const struct io_op_def *def = &io_op_defs[req->opcode];
5175 	struct io_ring_ctx *ctx = req->ctx;
5176 	struct async_poll *apoll;
5177 	struct io_poll_table ipt;
5178 	__poll_t mask, ret;
5179 	int rw;
5180 
5181 	if (!req->file || !file_can_poll(req->file))
5182 		return false;
5183 	if (req->flags & REQ_F_POLLED)
5184 		return false;
5185 	if (def->pollin)
5186 		rw = READ;
5187 	else if (def->pollout)
5188 		rw = WRITE;
5189 	else
5190 		return false;
5191 	/* if we can't nonblock try, then no point in arming a poll handler */
5192 	if (!io_file_supports_async(req->file, rw))
5193 		return false;
5194 
5195 	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5196 	if (unlikely(!apoll))
5197 		return false;
5198 	apoll->double_poll = NULL;
5199 
5200 	req->flags |= REQ_F_POLLED;
5201 	req->apoll = apoll;
5202 
5203 	mask = 0;
5204 	if (def->pollin)
5205 		mask |= POLLIN | POLLRDNORM;
5206 	if (def->pollout)
5207 		mask |= POLLOUT | POLLWRNORM;
5208 
5209 	/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5210 	if ((req->opcode == IORING_OP_RECVMSG) &&
5211 	    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5212 		mask &= ~POLLIN;
5213 
5214 	mask |= POLLERR | POLLPRI;
5215 
5216 	ipt.pt._qproc = io_async_queue_proc;
5217 
5218 	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5219 					io_async_wake);
5220 	if (ret || ipt.error) {
5221 		io_poll_remove_double(req);
5222 		spin_unlock_irq(&ctx->completion_lock);
5223 		kfree(apoll->double_poll);
5224 		kfree(apoll);
5225 		return false;
5226 	}
5227 	spin_unlock_irq(&ctx->completion_lock);
5228 	trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5229 					apoll->poll.events);
5230 	return true;
5231 }
5232 
__io_poll_remove_one(struct io_kiocb * req,struct io_poll_iocb * poll)5233 static bool __io_poll_remove_one(struct io_kiocb *req,
5234 				 struct io_poll_iocb *poll)
5235 {
5236 	bool do_complete = false;
5237 
5238 	spin_lock(&poll->head->lock);
5239 	WRITE_ONCE(poll->canceled, true);
5240 	if (!list_empty(&poll->wait.entry)) {
5241 		list_del_init(&poll->wait.entry);
5242 		do_complete = true;
5243 	}
5244 	spin_unlock(&poll->head->lock);
5245 	hash_del(&req->hash_node);
5246 	return do_complete;
5247 }
5248 
/*
 * Cancel the poll attached to @req, whether it is an explicit
 * IORING_OP_POLL_ADD or an internally armed async poll.
 *
 * Returns true if the wait entry was still queued; in that case a -ECANCELED
 * completion is posted here, linked requests are marked failed and a request
 * reference is dropped. Returns false if the entry was already off the
 * waitqueue, in which case nothing is completed here.
 */
static bool io_poll_remove_one(struct io_kiocb *req)
{
	bool do_complete;

	io_poll_remove_double(req);

	if (req->opcode == IORING_OP_POLL_ADD) {
		do_complete = __io_poll_remove_one(req, &req->poll);
	} else {
		struct async_poll *apoll = req->apoll;

		/* non-poll requests have submit ref still */
		do_complete = __io_poll_remove_one(req, &apoll->poll);
		if (do_complete) {
			io_put_req(req);
			/* the async poll data is owned here once unlinked */
			kfree(apoll->double_poll);
			kfree(apoll);
		}
	}

	if (do_complete) {
		io_cqring_fill_event(req, -ECANCELED);
		io_commit_cqring(req->ctx);
		req_set_fail_links(req);
		io_put_req_deferred(req, 1);
	}

	return do_complete;
}
5278 
5279 /*
5280  * Returns true if we found and killed one or more poll requests
5281  */
io_poll_remove_all(struct io_ring_ctx * ctx,struct task_struct * tsk)5282 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
5283 {
5284 	struct hlist_node *tmp;
5285 	struct io_kiocb *req;
5286 	int posted = 0, i;
5287 
5288 	spin_lock_irq(&ctx->completion_lock);
5289 	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5290 		struct hlist_head *list;
5291 
5292 		list = &ctx->cancel_hash[i];
5293 		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5294 			if (io_task_match(req, tsk))
5295 				posted += io_poll_remove_one(req);
5296 		}
5297 	}
5298 	spin_unlock_irq(&ctx->completion_lock);
5299 
5300 	if (posted)
5301 		io_cqring_ev_posted(ctx);
5302 
5303 	return posted != 0;
5304 }
5305 
/*
 * Cancel the first hashed poll request whose user_data equals @sqe_addr.
 *
 * Returns 0 if it was canceled, -EALREADY if it was found but could no
 * longer be removed, and -ENOENT if no such request is hashed.
 */
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
{
	struct hlist_head *bucket;
	struct io_kiocb *req;

	bucket = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
	hlist_for_each_entry(req, bucket, hash_node) {
		if (req->user_data == sqe_addr)
			return io_poll_remove_one(req) ? 0 : -EALREADY;
	}

	return -ENOENT;
}
5322 
io_poll_remove_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)5323 static int io_poll_remove_prep(struct io_kiocb *req,
5324 			       const struct io_uring_sqe *sqe)
5325 {
5326 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5327 		return -EINVAL;
5328 	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5329 	    sqe->poll_events)
5330 		return -EINVAL;
5331 
5332 	req->poll.addr = READ_ONCE(sqe->addr);
5333 	return 0;
5334 }
5335 
5336 /*
5337  * Find a running poll command that matches one specified in sqe->addr,
5338  * and remove it if found.
5339  */
io_poll_remove(struct io_kiocb * req)5340 static int io_poll_remove(struct io_kiocb *req)
5341 {
5342 	struct io_ring_ctx *ctx = req->ctx;
5343 	u64 addr;
5344 	int ret;
5345 
5346 	addr = req->poll.addr;
5347 	spin_lock_irq(&ctx->completion_lock);
5348 	ret = io_poll_cancel(ctx, addr);
5349 	spin_unlock_irq(&ctx->completion_lock);
5350 
5351 	if (ret < 0)
5352 		req_set_fail_links(req);
5353 	io_req_complete(req, ret);
5354 	return 0;
5355 }
5356 
io_poll_wake(struct wait_queue_entry * wait,unsigned mode,int sync,void * key)5357 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5358 			void *key)
5359 {
5360 	struct io_kiocb *req = wait->private;
5361 	struct io_poll_iocb *poll = &req->poll;
5362 
5363 	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5364 }
5365 
io_poll_queue_proc(struct file * file,struct wait_queue_head * head,struct poll_table_struct * p)5366 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5367 			       struct poll_table_struct *p)
5368 {
5369 	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5370 
5371 	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5372 }
5373 
io_poll_add_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)5374 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5375 {
5376 	struct io_poll_iocb *poll = &req->poll;
5377 	u32 events;
5378 
5379 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5380 		return -EINVAL;
5381 	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5382 		return -EINVAL;
5383 
5384 	events = READ_ONCE(sqe->poll32_events);
5385 #ifdef __BIG_ENDIAN
5386 	events = swahw32(events);
5387 #endif
5388 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5389 		       (events & EPOLLEXCLUSIVE);
5390 	return 0;
5391 }
5392 
io_poll_add(struct io_kiocb * req)5393 static int io_poll_add(struct io_kiocb *req)
5394 {
5395 	struct io_poll_iocb *poll = &req->poll;
5396 	struct io_ring_ctx *ctx = req->ctx;
5397 	struct io_poll_table ipt;
5398 	__poll_t mask;
5399 
5400 	ipt.pt._qproc = io_poll_queue_proc;
5401 
5402 	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5403 					io_poll_wake);
5404 
5405 	if (mask) { /* no async, we'd stolen it */
5406 		ipt.error = 0;
5407 		io_poll_complete(req, mask, 0);
5408 	}
5409 	spin_unlock_irq(&ctx->completion_lock);
5410 
5411 	if (mask) {
5412 		io_cqring_ev_posted(ctx);
5413 		io_put_req(req);
5414 	}
5415 	return ipt.error;
5416 }
5417 
io_timeout_fn(struct hrtimer * timer)5418 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5419 {
5420 	struct io_timeout_data *data = container_of(timer,
5421 						struct io_timeout_data, timer);
5422 	struct io_kiocb *req = data->req;
5423 	struct io_ring_ctx *ctx = req->ctx;
5424 	unsigned long flags;
5425 
5426 	spin_lock_irqsave(&ctx->completion_lock, flags);
5427 	list_del_init(&req->timeout.list);
5428 	atomic_set(&req->ctx->cq_timeouts,
5429 		atomic_read(&req->ctx->cq_timeouts) + 1);
5430 
5431 	io_cqring_fill_event(req, -ETIME);
5432 	io_commit_cqring(ctx);
5433 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
5434 
5435 	io_cqring_ev_posted(ctx);
5436 	req_set_fail_links(req);
5437 	io_put_req(req);
5438 	return HRTIMER_NORESTART;
5439 }
5440 
__io_timeout_cancel(struct io_kiocb * req)5441 static int __io_timeout_cancel(struct io_kiocb *req)
5442 {
5443 	struct io_timeout_data *io = req->async_data;
5444 	int ret;
5445 
5446 	ret = hrtimer_try_to_cancel(&io->timer);
5447 	if (ret == -1)
5448 		return -EALREADY;
5449 	list_del_init(&req->timeout.list);
5450 
5451 	req_set_fail_links(req);
5452 	io_cqring_fill_event(req, -ECANCELED);
5453 	io_put_req_deferred(req, 1);
5454 	return 0;
5455 }
5456 
/*
 * Find the first request on ctx->timeout_list whose user_data equals
 * @user_data and cancel it. Returns -ENOENT if there is none, otherwise the
 * result of __io_timeout_cancel().
 */
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
{
	struct io_kiocb *req;

	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
		/* returning straight away, so unlinking req here is fine */
		if (req->user_data == user_data)
			return __io_timeout_cancel(req);
	}

	return -ENOENT;
}
5474 
io_timeout_remove_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)5475 static int io_timeout_remove_prep(struct io_kiocb *req,
5476 				  const struct io_uring_sqe *sqe)
5477 {
5478 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5479 		return -EINVAL;
5480 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5481 		return -EINVAL;
5482 	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
5483 		return -EINVAL;
5484 
5485 	req->timeout_rem.addr = READ_ONCE(sqe->addr);
5486 	return 0;
5487 }
5488 
5489 /*
5490  * Remove or update an existing timeout command
5491  */
io_timeout_remove(struct io_kiocb * req)5492 static int io_timeout_remove(struct io_kiocb *req)
5493 {
5494 	struct io_ring_ctx *ctx = req->ctx;
5495 	int ret;
5496 
5497 	spin_lock_irq(&ctx->completion_lock);
5498 	ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
5499 
5500 	io_cqring_fill_event(req, ret);
5501 	io_commit_cqring(ctx);
5502 	spin_unlock_irq(&ctx->completion_lock);
5503 	io_cqring_ev_posted(ctx);
5504 	if (ret < 0)
5505 		req_set_fail_links(req);
5506 	io_put_req(req);
5507 	return 0;
5508 }
5509 
/*
 * Prepare a timeout request.
 *
 * @req:             request being prepared
 * @sqe:             submission entry; sqe->addr points at a user-space
 *                   timespec, sqe->off is the completion count to wait for,
 *                   sqe->timeout_flags may only contain IORING_TIMEOUT_ABS
 * @is_timeout_link: true when preparing a linked timeout, for which a
 *                   non-zero sqe->off is not allowed
 *
 * Returns 0 on success, -EINVAL for IOPOLL rings or malformed sqe fields,
 * -ENOMEM if the async data cannot be allocated and -EFAULT if the timespec
 * cannot be copied in. On success the hrtimer is initialised but not started.
 */
static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			   bool is_timeout_link)
{
	struct io_timeout_data *data;
	unsigned flags;
	u32 off = READ_ONCE(sqe->off);

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
		return -EINVAL;
	if (off && is_timeout_link)
		return -EINVAL;
	flags = READ_ONCE(sqe->timeout_flags);
	if (flags & ~IORING_TIMEOUT_ABS)
		return -EINVAL;

	req->timeout.off = off;

	if (!req->async_data && io_alloc_async_data(req))
		return -ENOMEM;

	data = req->async_data;
	data->req = req;

	/*
	 * The sqe lives in memory shared with the application, so load the
	 * pointer with READ_ONCE() like the other prep helpers do.
	 */
	if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr))))
		return -EFAULT;

	if (flags & IORING_TIMEOUT_ABS)
		data->mode = HRTIMER_MODE_ABS;
	else
		data->mode = HRTIMER_MODE_REL;

	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
	return 0;
}
5546 
/*
 * Issue handler for timeout requests: insert @req into ctx->timeout_list and
 * start its hrtimer. Always returns 0.
 *
 * Requests for which io_is_timeout_noseq() is true go to the tail of the
 * list. All others get a target sequence computed from the current CQ tail
 * (with completed timeouts subtracted) plus the requested offset, and are
 * placed by walking the list backwards.
 */
static int io_timeout(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_timeout_data *data = req->async_data;
	struct list_head *entry;
	u32 tail, off = req->timeout.off;

	spin_lock_irq(&ctx->completion_lock);

	/*
	 * sqe->off holds how many events that need to occur for this
	 * timeout event to be satisfied. If it isn't set, then this is
	 * a pure timeout request, sequence isn't used.
	 */
	if (io_is_timeout_noseq(req)) {
		entry = ctx->timeout_list.prev;
		goto add;
	}

	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	req->timeout.target_seq = tail + off;

	/*
	 * Insertion sort, ensuring the first entry in the list is always
	 * the one we need first.
	 */
	list_for_each_prev(entry, &ctx->timeout_list) {
		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
						  timeout.list);

		if (io_is_timeout_noseq(nxt))
			continue;
		/* nxt.seq is behind @tail, otherwise would've been completed */
		/* u32 distances from @tail are compared, not raw sequences */
		if (off >= nxt->timeout.target_seq - tail)
			break;
	}
add:
	/* insert right after @entry (the list head itself if no break hit) */
	list_add(&req->timeout.list, entry);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
	spin_unlock_irq(&ctx->completion_lock);
	return 0;
}
5590 
io_cancel_cb(struct io_wq_work * work,void * data)5591 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5592 {
5593 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5594 
5595 	return req->user_data == (unsigned long) data;
5596 }
5597 
io_async_cancel_one(struct io_ring_ctx * ctx,void * sqe_addr)5598 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
5599 {
5600 	enum io_wq_cancel cancel_ret;
5601 	int ret = 0;
5602 
5603 	cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
5604 	switch (cancel_ret) {
5605 	case IO_WQ_CANCEL_OK:
5606 		ret = 0;
5607 		break;
5608 	case IO_WQ_CANCEL_RUNNING:
5609 		ret = -EALREADY;
5610 		break;
5611 	case IO_WQ_CANCEL_NOTFOUND:
5612 		ret = -ENOENT;
5613 		break;
5614 	}
5615 
5616 	return ret;
5617 }
5618 
/*
 * Try to cancel the request identified by @sqe_addr, looking in turn at
 * io-wq work, pending timeouts and armed polls; the search stops at the
 * first place that returns something other than -ENOENT.
 *
 * @req is then completed with the outcome, where a result of 0 is replaced
 * by @success_ret. A negative final result fails requests linked to @req.
 */
static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
				     struct io_kiocb *req, __u64 sqe_addr,
				     int success_ret)
{
	unsigned long flags;
	int res;

	/* io-wq lookup happens before completion_lock is taken */
	res = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);

	spin_lock_irqsave(&ctx->completion_lock, flags);
	if (res == -ENOENT) {
		res = io_timeout_cancel(ctx, sqe_addr);
		if (res == -ENOENT)
			res = io_poll_cancel(ctx, sqe_addr);
	}

	if (!res)
		res = success_ret;
	io_cqring_fill_event(req, res);
	io_commit_cqring(ctx);
	spin_unlock_irqrestore(&ctx->completion_lock, flags);
	io_cqring_ev_posted(ctx);

	if (res < 0)
		req_set_fail_links(req);
	io_put_req(req);
}
5649 
io_async_cancel_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)5650 static int io_async_cancel_prep(struct io_kiocb *req,
5651 				const struct io_uring_sqe *sqe)
5652 {
5653 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5654 		return -EINVAL;
5655 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5656 		return -EINVAL;
5657 	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5658 		return -EINVAL;
5659 
5660 	req->cancel.addr = READ_ONCE(sqe->addr);
5661 	return 0;
5662 }
5663 
io_async_cancel(struct io_kiocb * req)5664 static int io_async_cancel(struct io_kiocb *req)
5665 {
5666 	struct io_ring_ctx *ctx = req->ctx;
5667 
5668 	io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5669 	return 0;
5670 }
5671 
io_files_update_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)5672 static int io_files_update_prep(struct io_kiocb *req,
5673 				const struct io_uring_sqe *sqe)
5674 {
5675 	if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5676 		return -EINVAL;
5677 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5678 		return -EINVAL;
5679 	if (sqe->ioprio || sqe->rw_flags)
5680 		return -EINVAL;
5681 
5682 	req->files_update.offset = READ_ONCE(sqe->off);
5683 	req->files_update.nr_args = READ_ONCE(sqe->len);
5684 	if (!req->files_update.nr_args)
5685 		return -EINVAL;
5686 	req->files_update.arg = READ_ONCE(sqe->addr);
5687 	return 0;
5688 }
5689 
/*
 * Issue handler for IORING_OP_FILES_UPDATE.
 *
 * Returns -EAGAIN when called with @force_nonblock set, since the update
 * takes ctx->uring_lock. Otherwise the update is performed under that lock
 * and the request is completed with its result; a negative result also fails
 * linked requests.
 */
static int io_files_update(struct io_kiocb *req, bool force_nonblock,
			   struct io_comp_state *cs)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_files_update up;
	int res;

	if (force_nonblock)
		return -EAGAIN;

	up.fds = req->files_update.arg;
	up.offset = req->files_update.offset;

	mutex_lock(&ctx->uring_lock);
	res = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
	mutex_unlock(&ctx->uring_lock);

	if (res < 0)
		req_set_fail_links(req);
	__io_req_complete(req, res, 0, cs);
	return 0;
}
5712 
/*
 * Dispatch @req to the opcode-specific prep helper, which validates @sqe and
 * copies what it needs into the request.
 *
 * Returns the helper's result (0 for IORING_OP_NOP, which needs no prep). An
 * opcode without a case here is logged once and rejected with -EINVAL.
 */
static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	switch (req->opcode) {
	case IORING_OP_NOP:
		return 0;
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
		return io_read_prep(req, sqe);
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		return io_write_prep(req, sqe);
	case IORING_OP_POLL_ADD:
		return io_poll_add_prep(req, sqe);
	case IORING_OP_POLL_REMOVE:
		return io_poll_remove_prep(req, sqe);
	case IORING_OP_FSYNC:
		return io_prep_fsync(req, sqe);
	case IORING_OP_SYNC_FILE_RANGE:
		return io_prep_sfr(req, sqe);
	case IORING_OP_SENDMSG:
	case IORING_OP_SEND:
		return io_sendmsg_prep(req, sqe);
	case IORING_OP_RECVMSG:
	case IORING_OP_RECV:
		return io_recvmsg_prep(req, sqe);
	case IORING_OP_CONNECT:
		return io_connect_prep(req, sqe);
	case IORING_OP_TIMEOUT:
		return io_timeout_prep(req, sqe, false);
	case IORING_OP_TIMEOUT_REMOVE:
		return io_timeout_remove_prep(req, sqe);
	case IORING_OP_ASYNC_CANCEL:
		return io_async_cancel_prep(req, sqe);
	case IORING_OP_LINK_TIMEOUT:
		/* same helper as IORING_OP_TIMEOUT, flagged as a linked timeout */
		return io_timeout_prep(req, sqe, true);
	case IORING_OP_ACCEPT:
		return io_accept_prep(req, sqe);
	case IORING_OP_FALLOCATE:
		return io_fallocate_prep(req, sqe);
	case IORING_OP_OPENAT:
		return io_openat_prep(req, sqe);
	case IORING_OP_CLOSE:
		return io_close_prep(req, sqe);
	case IORING_OP_FILES_UPDATE:
		return io_files_update_prep(req, sqe);
	case IORING_OP_STATX:
		return io_statx_prep(req, sqe);
	case IORING_OP_FADVISE:
		return io_fadvise_prep(req, sqe);
	case IORING_OP_MADVISE:
		return io_madvise_prep(req, sqe);
	case IORING_OP_OPENAT2:
		return io_openat2_prep(req, sqe);
	case IORING_OP_EPOLL_CTL:
		return io_epoll_ctl_prep(req, sqe);
	case IORING_OP_SPLICE:
		return io_splice_prep(req, sqe);
	case IORING_OP_PROVIDE_BUFFERS:
		return io_provide_buffers_prep(req, sqe);
	case IORING_OP_REMOVE_BUFFERS:
		return io_remove_buffers_prep(req, sqe);
	case IORING_OP_TEE:
		return io_tee_prep(req, sqe);
	}

	/* no case matched: warn once, then fail the request */
	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
			req->opcode);
	return-EINVAL;
}
5784 
io_req_defer_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)5785 static int io_req_defer_prep(struct io_kiocb *req,
5786 			     const struct io_uring_sqe *sqe)
5787 {
5788 	if (!sqe)
5789 		return 0;
5790 	if (io_alloc_async_data(req))
5791 		return -EAGAIN;
5792 	return io_req_prep(req, sqe);
5793 }
5794 
io_get_sequence(struct io_kiocb * req)5795 static u32 io_get_sequence(struct io_kiocb *req)
5796 {
5797 	struct io_kiocb *pos;
5798 	struct io_ring_ctx *ctx = req->ctx;
5799 	u32 total_submitted, nr_reqs = 1;
5800 
5801 	if (req->flags & REQ_F_LINK_HEAD)
5802 		list_for_each_entry(pos, &req->link_list, link_list)
5803 			nr_reqs++;
5804 
5805 	total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5806 	return total_submitted - nr_reqs;
5807 }
5808 
/*
 * Decide whether @req has to be deferred (drain semantics) and queue it on
 * ctx->defer_list if so.
 *
 * Returns 0 if the request may be issued right away, -EIOCBQUEUED if it was
 * either put on the defer list or handed to async work, or a negative error
 * from preparation / allocation.
 */
static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq;

	/* Still need defer if there is pending req in defer list. */
	if (likely(list_empty_careful(&ctx->defer_list) &&
		!(req->flags & REQ_F_IO_DRAIN)))
		return 0;

	seq = io_get_sequence(req);
	/* Still a chance to pass the sequence check */
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
		return 0;

	/* the request will run later, so prepare it now if not done yet */
	if (!req->async_data) {
		ret = io_req_defer_prep(req, sqe);
		if (ret)
			return ret;
	}
	io_prep_async_link(req);
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de)
		return -ENOMEM;

	spin_lock_irq(&ctx->completion_lock);
	/* recheck under the lock; the checks above were done without it */
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock_irq(&ctx->completion_lock);
		kfree(de);
		io_queue_async_work(req);
		return -EIOCBQUEUED;
	}

	trace_io_uring_defer(ctx, req, req->user_data);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock_irq(&ctx->completion_lock);
	return -EIOCBQUEUED;
}
5851 
/*
 * Undo the inflight tracking of @req: remove it from the ctx inflight list
 * (waking anyone sleeping on ctx->inflight_wait), clear REQ_F_INFLIGHT, drop
 * the files_struct and nsproxy references held through the work identity and
 * clear IO_WQ_WORK_FILES.
 */
static void io_req_drop_files(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->inflight_lock, flags);
	list_del(&req->inflight_entry);
	/* only issue the wakeup if somebody is actually waiting */
	if (waitqueue_active(&ctx->inflight_wait))
		wake_up(&ctx->inflight_wait);
	spin_unlock_irqrestore(&ctx->inflight_lock, flags);
	req->flags &= ~REQ_F_INFLIGHT;
	put_files_struct(req->work.identity->files);
	put_nsproxy(req->work.identity->nsproxy);
	req->work.flags &= ~IO_WQ_WORK_FILES;
}
5867 
__io_clean_op(struct io_kiocb * req)5868 static void __io_clean_op(struct io_kiocb *req)
5869 {
5870 	if (req->flags & REQ_F_BUFFER_SELECTED) {
5871 		switch (req->opcode) {
5872 		case IORING_OP_READV:
5873 		case IORING_OP_READ_FIXED:
5874 		case IORING_OP_READ:
5875 			kfree((void *)(unsigned long)req->rw.addr);
5876 			break;
5877 		case IORING_OP_RECVMSG:
5878 		case IORING_OP_RECV:
5879 			kfree(req->sr_msg.kbuf);
5880 			break;
5881 		}
5882 		req->flags &= ~REQ_F_BUFFER_SELECTED;
5883 	}
5884 
5885 	if (req->flags & REQ_F_NEED_CLEANUP) {
5886 		switch (req->opcode) {
5887 		case IORING_OP_READV:
5888 		case IORING_OP_READ_FIXED:
5889 		case IORING_OP_READ:
5890 		case IORING_OP_WRITEV:
5891 		case IORING_OP_WRITE_FIXED:
5892 		case IORING_OP_WRITE: {
5893 			struct io_async_rw *io = req->async_data;
5894 			if (io->free_iovec)
5895 				kfree(io->free_iovec);
5896 			break;
5897 			}
5898 		case IORING_OP_RECVMSG:
5899 		case IORING_OP_SENDMSG: {
5900 			struct io_async_msghdr *io = req->async_data;
5901 			if (io->iov != io->fast_iov)
5902 				kfree(io->iov);
5903 			break;
5904 			}
5905 		case IORING_OP_SPLICE:
5906 		case IORING_OP_TEE:
5907 			io_put_file(req, req->splice.file_in,
5908 				    (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5909 			break;
5910 		case IORING_OP_OPENAT:
5911 		case IORING_OP_OPENAT2:
5912 			if (req->open.filename)
5913 				putname(req->open.filename);
5914 			break;
5915 		}
5916 		req->flags &= ~REQ_F_NEED_CLEANUP;
5917 	}
5918 
5919 	if (req->flags & REQ_F_INFLIGHT)
5920 		io_req_drop_files(req);
5921 }
5922 
/*
 * Issue @req by dispatching to the handler for its opcode.
 *
 * @force_nonblock is forwarded to the handlers that take it, as is the
 * completion state @cs. An unknown opcode yields -EINVAL.
 *
 * Returns the handler's non-zero result unchanged. On 0, and if the ring
 * uses IORING_SETUP_IOPOLL and the request has a file, the request is also
 * registered for iopoll via io_iopoll_req_issued().
 */
static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
			struct io_comp_state *cs)
{
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	switch (req->opcode) {
	case IORING_OP_NOP:
		ret = io_nop(req, cs);
		break;
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
		ret = io_read(req, force_nonblock, cs);
		break;
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		ret = io_write(req, force_nonblock, cs);
		break;
	case IORING_OP_FSYNC:
		ret = io_fsync(req, force_nonblock);
		break;
	case IORING_OP_POLL_ADD:
		ret = io_poll_add(req);
		break;
	case IORING_OP_POLL_REMOVE:
		ret = io_poll_remove(req);
		break;
	case IORING_OP_SYNC_FILE_RANGE:
		ret = io_sync_file_range(req, force_nonblock);
		break;
	case IORING_OP_SENDMSG:
		ret = io_sendmsg(req, force_nonblock, cs);
		break;
	case IORING_OP_SEND:
		ret = io_send(req, force_nonblock, cs);
		break;
	case IORING_OP_RECVMSG:
		ret = io_recvmsg(req, force_nonblock, cs);
		break;
	case IORING_OP_RECV:
		ret = io_recv(req, force_nonblock, cs);
		break;
	case IORING_OP_TIMEOUT:
		ret = io_timeout(req);
		break;
	case IORING_OP_TIMEOUT_REMOVE:
		ret = io_timeout_remove(req);
		break;
	case IORING_OP_ACCEPT:
		ret = io_accept(req, force_nonblock, cs);
		break;
	case IORING_OP_CONNECT:
		ret = io_connect(req, force_nonblock, cs);
		break;
	case IORING_OP_ASYNC_CANCEL:
		ret = io_async_cancel(req);
		break;
	case IORING_OP_FALLOCATE:
		ret = io_fallocate(req, force_nonblock);
		break;
	case IORING_OP_OPENAT:
		ret = io_openat(req, force_nonblock);
		break;
	case IORING_OP_CLOSE:
		ret = io_close(req, force_nonblock, cs);
		break;
	case IORING_OP_FILES_UPDATE:
		ret = io_files_update(req, force_nonblock, cs);
		break;
	case IORING_OP_STATX:
		ret = io_statx(req, force_nonblock);
		break;
	case IORING_OP_FADVISE:
		ret = io_fadvise(req, force_nonblock);
		break;
	case IORING_OP_MADVISE:
		ret = io_madvise(req, force_nonblock);
		break;
	case IORING_OP_OPENAT2:
		ret = io_openat2(req, force_nonblock);
		break;
	case IORING_OP_EPOLL_CTL:
		ret = io_epoll_ctl(req, force_nonblock, cs);
		break;
	case IORING_OP_SPLICE:
		ret = io_splice(req, force_nonblock);
		break;
	case IORING_OP_PROVIDE_BUFFERS:
		ret = io_provide_buffers(req, force_nonblock, cs);
		break;
	case IORING_OP_REMOVE_BUFFERS:
		ret = io_remove_buffers(req, force_nonblock, cs);
		break;
	case IORING_OP_TEE:
		ret = io_tee(req, force_nonblock);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	/* any non-zero handler result goes straight back to the caller */
	if (ret)
		return ret;

	/* If the op doesn't have a file, we're not polling for it */
	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
		const bool in_async = io_wq_current_is_worker();

		/* workqueue context doesn't hold uring_lock, grab it now */
		if (in_async)
			mutex_lock(&ctx->uring_lock);

		io_iopoll_req_issued(req);

		if (in_async)
			mutex_unlock(&ctx->uring_lock);
	}

	return 0;
}
6045 
io_wq_submit_work(struct io_wq_work * work)6046 static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
6047 {
6048 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6049 	struct io_kiocb *timeout;
6050 	int ret = 0;
6051 
6052 	timeout = io_prep_linked_timeout(req);
6053 	if (timeout)
6054 		io_queue_linked_timeout(timeout);
6055 
6056 	/* if NO_CANCEL is set, we must still run the work */
6057 	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
6058 				IO_WQ_WORK_CANCEL) {
6059 		ret = -ECANCELED;
6060 	}
6061 
6062 	if (!ret) {
6063 		do {
6064 			ret = io_issue_sqe(req, false, NULL);
6065 			/*
6066 			 * We can get EAGAIN for polled IO even though we're
6067 			 * forcing a sync submission from here, since we can't
6068 			 * wait for request slots on the block side.
6069 			 */
6070 			if (ret != -EAGAIN)
6071 				break;
6072 			cond_resched();
6073 		} while (1);
6074 	}
6075 
6076 	if (ret) {
6077 		req_set_fail_links(req);
6078 		io_req_complete(req, ret);
6079 	}
6080 
6081 	return io_steal_work(req);
6082 }
6083 
io_file_from_index(struct io_ring_ctx * ctx,int index)6084 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6085 					      int index)
6086 {
6087 	struct fixed_file_table *table;
6088 
6089 	table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
6090 	return table->files[index & IORING_FILE_TABLE_MASK];
6091 }
6092 
/*
 * Resolve @fd to a struct file for @req.
 *
 * With @fixed set, @fd is an index into the ring's registered-file table:
 * out-of-range indexes yield NULL, and for a populated slot a percpu
 * reference is taken on the table's current ref node and remembered in
 * req->fixed_file_refs. Without @fixed, the lookup is traced and done
 * through __io_file_get().
 *
 * Returns the file, or NULL if none was found.
 */
static struct file *io_file_get(struct io_submit_state *state,
				struct io_kiocb *req, int fd, bool fixed)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file;

	if (!fixed) {
		trace_io_uring_file_get(ctx, fd);
		return __io_file_get(state, fd);
	}

	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
		return NULL;

	/* clamp the index under speculation before using it */
	fd = array_index_nospec(fd, ctx->nr_user_files);
	file = io_file_from_index(ctx, fd);
	if (!file)
		return NULL;

	req->fixed_file_refs = &ctx->file_data->node->refs;
	percpu_ref_get(req->fixed_file_refs);
	return file;
}
6115 
io_req_set_file(struct io_submit_state * state,struct io_kiocb * req,int fd)6116 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
6117 			   int fd)
6118 {
6119 	bool fixed;
6120 
6121 	fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
6122 	if (unlikely(!fixed && io_async_submit(req->ctx)))
6123 		return -EBADF;
6124 
6125 	req->file = io_file_get(state, req, fd, fixed);
6126 	if (req->file || io_op_defs[req->opcode].needs_file_no_error)
6127 		return 0;
6128 	return -EBADF;
6129 }
6130 
/*
 * hrtimer callback for a linked timeout.
 *
 * With ctx->completion_lock held, looks at the entry in front of the
 * timeout request on its link list. If a reference to that request can be
 * taken, the timeout is unlinked, the request is marked as failing its
 * links and it is handed to io_async_find_and_cancel() with -ETIME.
 * Otherwise (list already empty, or the reference could not be taken) the
 * timeout request itself is completed with -ETIME.
 *
 * Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
	struct io_timeout_data *data = container_of(timer,
						struct io_timeout_data, timer);
	struct io_kiocb *req = data->req;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *prev = NULL;
	unsigned long flags;

	spin_lock_irqsave(&ctx->completion_lock, flags);

	/*
	 * We don't expect the list to be empty, that will only happen if we
	 * race with the completion of the linked work.
	 */
	if (!list_empty(&req->link_list)) {
		prev = list_entry(req->link_list.prev, struct io_kiocb,
				  link_list);
		/* only unlink if prev can still be pinned with a reference */
		if (refcount_inc_not_zero(&prev->refs))
			list_del_init(&req->link_list);
		else
			prev = NULL;
	}

	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	if (prev) {
		req_set_fail_links(prev);
		io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
		/* drop the reference taken under the lock above */
		io_put_req(prev);
	} else {
		io_req_complete(req, -ETIME);
	}
	return HRTIMER_NORESTART;
}
6166 
__io_queue_linked_timeout(struct io_kiocb * req)6167 static void __io_queue_linked_timeout(struct io_kiocb *req)
6168 {
6169 	/*
6170 	 * If the list is now empty, then our linked request finished before
6171 	 * we got a chance to setup the timer
6172 	 */
6173 	if (!list_empty(&req->link_list)) {
6174 		struct io_timeout_data *data = req->async_data;
6175 
6176 		data->timer.function = io_link_timeout_fn;
6177 		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6178 				data->mode);
6179 	}
6180 }
6181 
io_queue_linked_timeout(struct io_kiocb * req)6182 static void io_queue_linked_timeout(struct io_kiocb *req)
6183 {
6184 	struct io_ring_ctx *ctx = req->ctx;
6185 
6186 	spin_lock_irq(&ctx->completion_lock);
6187 	__io_queue_linked_timeout(req);
6188 	spin_unlock_irq(&ctx->completion_lock);
6189 
6190 	/* drop submission reference */
6191 	io_put_req(req);
6192 }
6193 
io_prep_linked_timeout(struct io_kiocb * req)6194 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6195 {
6196 	struct io_kiocb *nxt;
6197 
6198 	if (!(req->flags & REQ_F_LINK_HEAD))
6199 		return NULL;
6200 	if (req->flags & REQ_F_LINK_TIMEOUT)
6201 		return NULL;
6202 
6203 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
6204 					link_list);
6205 	if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
6206 		return NULL;
6207 
6208 	nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6209 	req->flags |= REQ_F_LINK_TIMEOUT;
6210 	return nxt;
6211 }
6212 
/*
 * Issue @req inline in non-blocking mode and deal with the outcome.
 *
 *  - -EAGAIN without REQ_F_NOWAIT: arm a poll handler, or failing that
 *    queue the request for async execution.
 *  - success: drop the submission reference; if io_put_req_find_next()
 *    returns another request, issue that one inline as well, or queue it
 *    for async execution when it carries REQ_F_FORCE_ASYNC.
 *  - any other result: fail the request and its links with that result.
 *
 * If a request carries its own credentials (IO_WQ_WORK_CREDS) that differ
 * from the current ones, they are applied around the issue; overridden
 * credentials are reverted before returning.
 */
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
{
	struct io_kiocb *linked_timeout;
	const struct cred *old_creds = NULL;
	int ret;

again:
	linked_timeout = io_prep_linked_timeout(req);

	if ((req->flags & REQ_F_WORK_INITIALIZED) &&
	    (req->work.flags & IO_WQ_WORK_CREDS) &&
	    req->work.identity->creds != current_cred()) {
		/* undo an override left over from an earlier pass of the loop */
		if (old_creds)
			revert_creds(old_creds);
		if (old_creds == req->work.identity->creds)
			old_creds = NULL; /* restored original creds */
		else
			old_creds = override_creds(req->work.identity->creds);
	}

	ret = io_issue_sqe(req, true, cs);

	/*
	 * We async punt it if the file wasn't marked NOWAIT, or if the file
	 * doesn't support non-blocking read/write attempts
	 */
	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
		if (!io_arm_poll_handler(req)) {
			/*
			 * Queued up for async execution, worker will release
			 * submit reference when the iocb is actually submitted.
			 */
			io_queue_async_work(req);
		}

		if (linked_timeout)
			io_queue_linked_timeout(linked_timeout);
	} else if (likely(!ret)) {
		/* drop submission reference */
		req = io_put_req_find_next(req);
		if (linked_timeout)
			io_queue_linked_timeout(linked_timeout);

		/* req now is whatever io_put_req_find_next() returned, if any */
		if (req) {
			if (!(req->flags & REQ_F_FORCE_ASYNC))
				goto again;
			io_queue_async_work(req);
		}
	} else {
		/* un-prep timeout, so it'll be killed as any other linked */
		req->flags &= ~REQ_F_LINK_TIMEOUT;
		req_set_fail_links(req);
		io_put_req(req);
		io_req_complete(req, ret);
	}

	if (old_creds)
		revert_creds(old_creds);
}
6272 
io_queue_sqe(struct io_kiocb * req,const struct io_uring_sqe * sqe,struct io_comp_state * cs)6273 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6274 			 struct io_comp_state *cs)
6275 {
6276 	int ret;
6277 
6278 	ret = io_req_defer(req, sqe);
6279 	if (ret) {
6280 		if (ret != -EIOCBQUEUED) {
6281 fail_req:
6282 			req_set_fail_links(req);
6283 			io_put_req(req);
6284 			io_req_complete(req, ret);
6285 		}
6286 	} else if (req->flags & REQ_F_FORCE_ASYNC) {
6287 		if (!req->async_data) {
6288 			ret = io_req_defer_prep(req, sqe);
6289 			if (unlikely(ret))
6290 				goto fail_req;
6291 		}
6292 		io_queue_async_work(req);
6293 	} else {
6294 		if (sqe) {
6295 			ret = io_req_prep(req, sqe);
6296 			if (unlikely(ret))
6297 				goto fail_req;
6298 		}
6299 		__io_queue_sqe(req, cs);
6300 	}
6301 }
6302 
io_queue_link_head(struct io_kiocb * req,struct io_comp_state * cs)6303 static inline void io_queue_link_head(struct io_kiocb *req,
6304 				      struct io_comp_state *cs)
6305 {
6306 	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
6307 		io_put_req(req);
6308 		io_req_complete(req, -ECANCELED);
6309 	} else
6310 		io_queue_sqe(req, NULL, cs);
6311 }
6312 
io_submit_sqe(struct io_kiocb * req,const struct io_uring_sqe * sqe,struct io_kiocb ** link,struct io_comp_state * cs)6313 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6314 			 struct io_kiocb **link, struct io_comp_state *cs)
6315 {
6316 	struct io_ring_ctx *ctx = req->ctx;
6317 	int ret;
6318 
6319 	/*
6320 	 * If we already have a head request, queue this one for async
6321 	 * submittal once the head completes. If we don't have a head but
6322 	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6323 	 * submitted sync once the chain is complete. If none of those
6324 	 * conditions are true (normal request), then just queue it.
6325 	 */
6326 	if (*link) {
6327 		struct io_kiocb *head = *link;
6328 
6329 		/*
6330 		 * Taking sequential execution of a link, draining both sides
6331 		 * of the link also fullfils IOSQE_IO_DRAIN semantics for all
6332 		 * requests in the link. So, it drains the head and the
6333 		 * next after the link request. The last one is done via
6334 		 * drain_next flag to persist the effect across calls.
6335 		 */
6336 		if (req->flags & REQ_F_IO_DRAIN) {
6337 			head->flags |= REQ_F_IO_DRAIN;
6338 			ctx->drain_next = 1;
6339 		}
6340 		ret = io_req_defer_prep(req, sqe);
6341 		if (unlikely(ret)) {
6342 			/* fail even hard links since we don't submit */
6343 			head->flags |= REQ_F_FAIL_LINK;
6344 			return ret;
6345 		}
6346 		trace_io_uring_link(ctx, req, head);
6347 		list_add_tail(&req->link_list, &head->link_list);
6348 
6349 		/* last request of a link, enqueue the link */
6350 		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6351 			io_queue_link_head(head, cs);
6352 			*link = NULL;
6353 		}
6354 	} else {
6355 		if (unlikely(ctx->drain_next)) {
6356 			req->flags |= REQ_F_IO_DRAIN;
6357 			ctx->drain_next = 0;
6358 		}
6359 		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6360 			req->flags |= REQ_F_LINK_HEAD;
6361 			INIT_LIST_HEAD(&req->link_list);
6362 
6363 			ret = io_req_defer_prep(req, sqe);
6364 			if (unlikely(ret))
6365 				req->flags |= REQ_F_FAIL_LINK;
6366 			*link = req;
6367 		} else {
6368 			io_queue_sqe(req, sqe, cs);
6369 		}
6370 	}
6371 
6372 	return 0;
6373 }
6374 
6375 /*
6376  * Batched submission is done, ensure local IO is flushed out.
6377  */
io_submit_state_end(struct io_submit_state * state)6378 static void io_submit_state_end(struct io_submit_state *state)
6379 {
6380 	if (!list_empty(&state->comp.list))
6381 		io_submit_flush_completions(&state->comp);
6382 	blk_finish_plug(&state->plug);
6383 	io_state_file_put(state);
6384 	if (state->free_reqs)
6385 		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
6386 }
6387 
6388 /*
6389  * Start submission side cache.
6390  */
io_submit_state_start(struct io_submit_state * state,struct io_ring_ctx * ctx,unsigned int max_ios)6391 static void io_submit_state_start(struct io_submit_state *state,
6392 				  struct io_ring_ctx *ctx, unsigned int max_ios)
6393 {
6394 	blk_start_plug(&state->plug);
6395 	state->comp.nr = 0;
6396 	INIT_LIST_HEAD(&state->comp.list);
6397 	state->comp.ctx = ctx;
6398 	state->free_reqs = 0;
6399 	state->file = NULL;
6400 	state->ios_left = max_ios;
6401 }
6402 
io_commit_sqring(struct io_ring_ctx * ctx)6403 static void io_commit_sqring(struct io_ring_ctx *ctx)
6404 {
6405 	struct io_rings *rings = ctx->rings;
6406 
6407 	/*
6408 	 * Ensure any loads from the SQEs are done at this point,
6409 	 * since once we write the new head, the application could
6410 	 * write new data to them.
6411 	 */
6412 	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6413 }
6414 
6415 /*
6416  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6417  * that is mapped by userspace. This means that care needs to be taken to
6418  * ensure that reads are stable, as we cannot rely on userspace always
6419  * being a good citizen. If members of the sqe are validated and then later
6420  * used, it's important that those reads are done through READ_ONCE() to
6421  * prevent a re-load down the line.
6422  */
io_get_sqe(struct io_ring_ctx * ctx)6423 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6424 {
6425 	u32 *sq_array = ctx->sq_array;
6426 	unsigned head;
6427 
6428 	/*
6429 	 * The cached sq head (or cq tail) serves two purposes:
6430 	 *
6431 	 * 1) allows us to batch the cost of updating the user visible
6432 	 *    head updates.
6433 	 * 2) allows the kernel side to track the head on its own, even
6434 	 *    though the application is the one updating it.
6435 	 */
6436 	head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
6437 	if (likely(head < ctx->sq_entries))
6438 		return &ctx->sq_sqes[head];
6439 
6440 	/* drop invalid entries */
6441 	ctx->cached_sq_dropped++;
6442 	WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6443 	return NULL;
6444 }
6445 
io_consume_sqe(struct io_ring_ctx * ctx)6446 static inline void io_consume_sqe(struct io_ring_ctx *ctx)
6447 {
6448 	ctx->cached_sq_head++;
6449 }
6450 
6451 /*
6452  * Check SQE restrictions (opcode and flags).
6453  *
6454  * Returns 'true' if SQE is allowed, 'false' otherwise.
6455  */
io_check_restriction(struct io_ring_ctx * ctx,struct io_kiocb * req,unsigned int sqe_flags)6456 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6457 					struct io_kiocb *req,
6458 					unsigned int sqe_flags)
6459 {
6460 	if (!ctx->restricted)
6461 		return true;
6462 
6463 	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6464 		return false;
6465 
6466 	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6467 	    ctx->restrictions.sqe_flags_required)
6468 		return false;
6469 
6470 	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6471 			  ctx->restrictions.sqe_flags_required))
6472 		return false;
6473 
6474 	return true;
6475 }
6476 
/*
 * Every IOSQE_* bit that may be set in sqe->flags. io_init_req() rejects
 * an sqe with -EINVAL if any other bit is present.
 */
#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN |	\
			 IOSQE_IO_LINK | IOSQE_IO_HARDLINK |	\
			 IOSQE_ASYNC | IOSQE_BUFFER_SELECT)
6480 
/*
 * Initialise @req from the userspace-mapped @sqe.
 *
 * All sqe fields are read through READ_ONCE(). The request starts out with
 * two references and is owned by the current task.
 *
 * Returns 0 on success, or:
 *   -EINVAL      opcode out of range, unknown sqe flag, or a personality id
 *                that is not registered with the ring
 *   -EFAULT      io_sq_thread_acquire_mm() failed
 *   -EACCES      rejected by io_check_restriction()
 *   -EOPNOTSUPP  IOSQE_BUFFER_SELECT on an opcode without buffer_select
 * For opcodes that need a file, the result of io_req_set_file() is
 * returned.
 */
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe,
		       struct io_submit_state *state)
{
	unsigned int sqe_flags;
	int id, ret;

	req->opcode = READ_ONCE(sqe->opcode);
	req->user_data = READ_ONCE(sqe->user_data);
	req->async_data = NULL;
	req->file = NULL;
	req->ctx = ctx;
	req->flags = 0;
	/* one is dropped after submission, the other at completion */
	refcount_set(&req->refs, 2);
	req->task = current;
	req->result = 0;

	/* must come before any io_op_defs[req->opcode] lookup below */
	if (unlikely(req->opcode >= IORING_OP_LAST))
		return -EINVAL;

	if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
		return -EFAULT;

	sqe_flags = READ_ONCE(sqe->flags);
	/* enforce forwards compatibility on users */
	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
		return -EINVAL;

	if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
		return -EACCES;

	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
	    !io_op_defs[req->opcode].buffer_select)
		return -EOPNOTSUPP;

	/* a non-zero personality selects a registered identity for this req */
	id = READ_ONCE(sqe->personality);
	if (id) {
		struct io_identity *iod;

		iod = idr_find(&ctx->personality_idr, id);
		if (unlikely(!iod))
			return -EINVAL;
		refcount_inc(&iod->count);

		__io_req_init_async(req);
		get_cred(iod->creds);
		req->work.identity = iod;
		req->work.flags |= IO_WQ_WORK_CREDS;
	}

	/* same numerical values with corresponding REQ_F_*, safe to copy */
	req->flags |= sqe_flags;

	if (!io_op_defs[req->opcode].needs_file)
		return 0;

	ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
	/* one fewer file lookup left in this batch, whether or not it worked */
	state->ios_left--;
	return ret;
}
6542 
io_submit_sqes(struct io_ring_ctx * ctx,unsigned int nr)6543 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6544 {
6545 	struct io_submit_state state;
6546 	struct io_kiocb *link = NULL;
6547 	int i, submitted = 0;
6548 
6549 	/* if we have a backlog and couldn't flush it all, return BUSY */
6550 	if (test_bit(0, &ctx->sq_check_overflow)) {
6551 		if (!list_empty(&ctx->cq_overflow_list) &&
6552 		    !io_cqring_overflow_flush(ctx, false, NULL, NULL))
6553 			return -EBUSY;
6554 	}
6555 
6556 	/* make sure SQ entry isn't read before tail */
6557 	nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6558 
6559 	if (!percpu_ref_tryget_many(&ctx->refs, nr))
6560 		return -EAGAIN;
6561 
6562 	percpu_counter_add(¤t->io_uring->inflight, nr);
6563 	refcount_add(nr, ¤t->usage);
6564 
6565 	io_submit_state_start(&state, ctx, nr);
6566 
6567 	for (i = 0; i < nr; i++) {
6568 		const struct io_uring_sqe *sqe;
6569 		struct io_kiocb *req;
6570 		int err;
6571 
6572 		sqe = io_get_sqe(ctx);
6573 		if (unlikely(!sqe)) {
6574 			io_consume_sqe(ctx);
6575 			break;
6576 		}
6577 		req = io_alloc_req(ctx, &state);
6578 		if (unlikely(!req)) {
6579 			if (!submitted)
6580 				submitted = -EAGAIN;
6581 			break;
6582 		}
6583 		io_consume_sqe(ctx);
6584 		/* will complete beyond this point, count as submitted */
6585 		submitted++;
6586 
6587 		err = io_init_req(ctx, req, sqe, &state);
6588 		if (unlikely(err)) {
6589 fail_req:
6590 			io_put_req(req);
6591 			io_req_complete(req, err);
6592 			break;
6593 		}
6594 
6595 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6596 						true, io_async_submit(ctx));
6597 		err = io_submit_sqe(req, sqe, &link, &state.comp);
6598 		if (err)
6599 			goto fail_req;
6600 	}
6601 
6602 	if (unlikely(submitted != nr)) {
6603 		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6604 		struct io_uring_task *tctx = current->io_uring;
6605 		int unused = nr - ref_used;
6606 
6607 		percpu_ref_put_many(&ctx->refs, unused);
6608 		percpu_counter_sub(&tctx->inflight, unused);
6609 		put_task_struct_many(current, unused);
6610 	}
6611 	if (link)
6612 		io_queue_link_head(link, &state.comp);
6613 	io_submit_state_end(&state);
6614 
6615 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
6616 	io_commit_sqring(ctx);
6617 
6618 	return submitted;
6619 }
6620 
/*
 * Set IORING_SQ_NEED_WAKEUP in the shared SQ ring flags of @ctx. The flag
 * word is modified with ctx->completion_lock held.
 */
static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
	/* Tell userspace we may need a wakeup call */
	spin_lock_irq(&ctx->completion_lock);
	ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
	spin_unlock_irq(&ctx->completion_lock);
}
6628 
/*
 * Clear IORING_SQ_NEED_WAKEUP in the shared SQ ring flags of @ctx. The flag
 * word is modified with ctx->completion_lock held.
 */
static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
	spin_lock_irq(&ctx->completion_lock);
	ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
	spin_unlock_irq(&ctx->completion_lock);
}
6635 
io_sq_wake_function(struct wait_queue_entry * wqe,unsigned mode,int sync,void * key)6636 static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
6637 			       int sync, void *key)
6638 {
6639 	struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
6640 	int ret;
6641 
6642 	ret = autoremove_wake_function(wqe, mode, sync, key);
6643 	if (ret) {
6644 		unsigned long flags;
6645 
6646 		spin_lock_irqsave(&ctx->completion_lock, flags);
6647 		ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6648 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
6649 	}
6650 	return ret;
6651 }
6652 
/*
 * Result of one __io_sq_thread() pass. The values are distinct bits because
 * io_sq_thread() ORs together the results of all rings it serves.
 */
enum sq_ret {
	SQT_IDLE	= 1 << 0,
	SQT_SPIN	= 1 << 1,
	SQT_DID_WORK	= 1 << 2,
};
6658 
/*
 * Run one pass of the SQ poll thread for @ctx.
 *
 * @start_jiffies: start of the current idle period; ctx->sq_thread_idle is
 *		   added to it to get the point up to which we keep spinning
 * @cap_entries:   cap a single submission at 8 entries
 *
 * Returns SQT_SPIN when the caller should keep polling, SQT_IDLE when there
 * is nothing to submit and the ring's wait entry has been queued on
 * sqd->wait (it is left queued on return), or SQT_DID_WORK once the
 * submission step has been reached.
 *
 * NOTE(review): 'ret' is only assigned by io_submit_sqes() right before the
 * function returns, so every 'ret == -EBUSY' / 'ret != -EBUSY' test below
 * sees 0 within a single call.
 */
static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
				  unsigned long start_jiffies, bool cap_entries)
{
	unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
	struct io_sq_data *sqd = ctx->sq_data;
	unsigned int to_submit;
	int ret = 0;

again:
	/* reap polled IO first, unless we are asked to reschedule */
	if (!list_empty(&ctx->iopoll_list)) {
		unsigned nr_events = 0;

		mutex_lock(&ctx->uring_lock);
		if (!list_empty(&ctx->iopoll_list) && !need_resched())
			io_do_iopoll(ctx, &nr_events, 0);
		mutex_unlock(&ctx->uring_lock);
	}

	to_submit = io_sqring_entries(ctx);

	/*
	 * If submit got -EBUSY, flag us as needing the application
	 * to enter the kernel to reap and flush events.
	 */
	if (!to_submit || ret == -EBUSY || need_resched()) {
		/*
		 * Drop cur_mm before scheduling, we can't hold it for
		 * long periods (or over schedule()). Do this before
		 * adding ourselves to the waitqueue, as the unuse/drop
		 * may sleep.
		 */
		io_sq_thread_drop_mm();

		/*
		 * We're polling. If we're within the defined idle
		 * period, then let us spin without work before going
		 * to sleep. The exception is if we got EBUSY doing
		 * more IO, we should wait for the application to
		 * reap events and wake us up.
		 */
		if (!list_empty(&ctx->iopoll_list) || need_resched() ||
		    (!time_after(jiffies, timeout) && ret != -EBUSY &&
		    !percpu_ref_is_dying(&ctx->refs)))
			return SQT_SPIN;

		/* queue the wait entry before the final checks below */
		prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
					TASK_INTERRUPTIBLE);

		/*
		 * While doing polled IO, before going to sleep, we need
		 * to check if there are new reqs added to iopoll_list,
		 * it is because reqs may have been punted to io worker
		 * and will be added to iopoll_list later, hence check
		 * the iopoll_list again.
		 */
		if ((ctx->flags & IORING_SETUP_IOPOLL) &&
		    !list_empty_careful(&ctx->iopoll_list)) {
			finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
			goto again;
		}

		/* re-check for new entries now that the wait entry is queued */
		to_submit = io_sqring_entries(ctx);
		if (!to_submit || ret == -EBUSY)
			return SQT_IDLE;
	}

	/* there is work: leave the waitqueue and withdraw the wakeup request */
	finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
	io_ring_clear_wakeup_flag(ctx);

	/* if we're handling multiple rings, cap submit size for fairness */
	if (cap_entries && to_submit > 8)
		to_submit = 8;

	mutex_lock(&ctx->uring_lock);
	/* no submission once the ctx references are being killed */
	if (likely(!percpu_ref_is_dying(&ctx->refs)))
		ret = io_submit_sqes(ctx, to_submit);
	mutex_unlock(&ctx->uring_lock);

	/* wake waiters on sqo_sq_wait once the SQ ring is no longer full */
	if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
		wake_up(&ctx->sqo_sq_wait);

	return SQT_DID_WORK;
}
6742 
io_sqd_init_new(struct io_sq_data * sqd)6743 static void io_sqd_init_new(struct io_sq_data *sqd)
6744 {
6745 	struct io_ring_ctx *ctx;
6746 
6747 	while (!list_empty(&sqd->ctx_new_list)) {
6748 		ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
6749 		init_wait(&ctx->sqo_wait_entry);
6750 		ctx->sqo_wait_entry.func = io_sq_wake_function;
6751 		list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
6752 		complete(&ctx->sq_thread_comp);
6753 	}
6754 }
6755 
/*
 * Main loop of the SQ poll kernel thread. @data is the struct io_sq_data
 * describing the set of rings this thread serves.
 *
 * Each iteration parks if asked to, adopts rings from sqd->ctx_new_list,
 * and then runs __io_sq_thread() for every ring on sqd->ctx_list with that
 * ring's credentials, blkcg association and (with CONFIG_AUDIT) loginuid
 * and sessionid applied. The per-ring results are ORed together:
 *
 *  - any ring asked to spin: run task work and yield the CPU;
 *  - the combined result is exactly SQT_IDLE: set IORING_SQ_NEED_WAKEUP on
 *    all rings, sleep, then clear the flag again and restart the idle
 *    period.
 *
 * Always returns 0.
 */
static int io_sq_thread(void *data)
{
	struct cgroup_subsys_state *cur_css = NULL;
	const struct cred *old_cred = NULL;
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	unsigned long start_jiffies;

	start_jiffies = jiffies;
	while (!kthread_should_stop()) {
		enum sq_ret ret = 0;
		bool cap_entries;

		/*
		 * Any changes to the sqd lists are synchronized through the
		 * kthread parking. This synchronizes the thread vs users,
		 * the users are synchronized on the sqd->ctx_lock.
		 */
		if (kthread_should_park())
			kthread_parkme();

		if (unlikely(!list_empty(&sqd->ctx_new_list)))
			io_sqd_init_new(sqd);

		/* cap per-ring submissions unless exactly one ring is attached */
		cap_entries = !list_is_singular(&sqd->ctx_list);

		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			/* switch to this ring's credentials if they differ */
			if (current->cred != ctx->creds) {
				if (old_cred)
					revert_creds(old_cred);
				old_cred = override_creds(ctx->creds);
			}
			io_sq_thread_associate_blkcg(ctx, &cur_css);
#ifdef CONFIG_AUDIT
			current->loginuid = ctx->loginuid;
			current->sessionid = ctx->sessionid;
#endif

			ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);

			io_sq_thread_drop_mm();
		}

		if (ret & SQT_SPIN) {
			io_run_task_work();
			cond_resched();
		} else if (ret == SQT_IDLE) {
			/* only SQT_IDLE was reported: no ring spun or did work */
			if (kthread_should_park())
				continue;
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_set_wakeup_flag(ctx);
			schedule();
			start_jiffies = jiffies;
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_clear_wakeup_flag(ctx);
		}
	}

	io_run_task_work();

	/* undo the blkcg association and credential override before exiting */
	if (cur_css)
		io_sq_thread_unassociate_blkcg();
	if (old_cred)
		revert_creds(old_cred);

	kthread_parkme();

	return 0;
}
6825 
/*
 * On-stack wait state of a task sleeping in io_cqring_wait().
 */
struct io_wait_queue {
	/* entry queued on ctx->wait, woken through io_wake_function() */
	struct wait_queue_entry wq;
	/* ring being waited on */
	struct io_ring_ctx *ctx;
	/* number of CQ events the waiter asked for (min_events) */
	unsigned to_wait;
	/* ctx->cq_timeouts as sampled before the wait loop started */
	unsigned nr_timeouts;
};
6832 
/*
 * Decide whether the waiter described by @iowq should stop waiting.
 * @noflush is passed through to io_cqring_events().
 *
 * True if enough CQ events are available, or if the ring's timeout count
 * changed since the waiter sampled it. A timeout always sends the waiter
 * back to userspace, regardless of the event count.
 */
static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
{
	struct io_ring_ctx *ctx = iowq->ctx;

	if (io_cqring_events(ctx, noflush) >= iowq->to_wait)
		return true;

	return atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
6845 
io_wake_function(struct wait_queue_entry * curr,unsigned int mode,int wake_flags,void * key)6846 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6847 			    int wake_flags, void *key)
6848 {
6849 	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6850 							wq);
6851 
6852 	/* use noflush == true, as we can't safely rely on locking context */
6853 	if (!io_should_wake(iowq, true))
6854 		return -1;
6855 
6856 	return autoremove_wake_function(curr, mode, wake_flags, key);
6857 }
6858 
io_run_task_work_sig(void)6859 static int io_run_task_work_sig(void)
6860 {
6861 	if (io_run_task_work())
6862 		return 1;
6863 	if (!signal_pending(current))
6864 		return 0;
6865 	if (current->jobctl & JOBCTL_TASK_WORK) {
6866 		spin_lock_irq(¤t->sighand->siglock);
6867 		current->jobctl &= ~JOBCTL_TASK_WORK;
6868 		recalc_sigpending();
6869 		spin_unlock_irq(¤t->sighand->siglock);
6870 		return 1;
6871 	}
6872 	return -EINTR;
6873 }
6874 
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @min_events: number of CQ events to wait for
 * @sig/@sigsz: optional user signal mask to install for the duration of
 *		the wait
 *
 * Returns 0 if enough events were already available, or if the CQ ring is
 * not empty when the wait ends. Otherwise returns the error from installing
 * the signal mask, or the last io_run_task_work_sig() result (-EINTR when
 * interrupted by a signal).
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz)
{
	struct io_wait_queue iowq = {
		.wq = {
			.private	= current,
			.func		= io_wake_function,
			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
		},
		.ctx		= ctx,
		.to_wait	= min_events,
	};
	struct io_rings *rings = ctx->rings;
	int ret = 0;

	/* fast path: keep running task work while it may produce more events */
	do {
		if (io_cqring_events(ctx, false) >= min_events)
			return 0;
		if (!io_run_task_work())
			break;
	} while (1);

	if (sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
						      sigsz);
		else
#endif
			ret = set_user_sigmask(sig, sigsz);

		if (ret)
			return ret;
	}

	/* snapshot, so io_should_wake() can tell if a timeout fired meanwhile */
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
						TASK_INTERRUPTIBLE);
		/* make sure we run task_work before checking for signals */
		ret = io_run_task_work_sig();
		if (ret > 0)
			continue;
		else if (ret < 0)
			break;
		if (io_should_wake(&iowq, false))
			break;
		schedule();
	} while (1);
	finish_wait(&ctx->wait, &iowq.wq);

	restore_saved_sigmask_unless(ret == -EINTR);

	/* only report ret if the CQ ring is empty */
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
6935 
__io_sqe_files_unregister(struct io_ring_ctx * ctx)6936 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6937 {
6938 #if defined(CONFIG_UNIX)
6939 	if (ctx->ring_sock) {
6940 		struct sock *sock = ctx->ring_sock->sk;
6941 		struct sk_buff *skb;
6942 
6943 		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6944 			kfree_skb(skb);
6945 	}
6946 #else
6947 	int i;
6948 
6949 	for (i = 0; i < ctx->nr_user_files; i++) {
6950 		struct file *file;
6951 
6952 		file = io_file_from_index(ctx, i);
6953 		if (file)
6954 			fput(file);
6955 	}
6956 #endif
6957 }
6958 
io_file_ref_kill(struct percpu_ref * ref)6959 static void io_file_ref_kill(struct percpu_ref *ref)
6960 {
6961 	struct fixed_file_data *data;
6962 
6963 	data = container_of(ref, struct fixed_file_data, refs);
6964 	complete(&data->done);
6965 }
6966 
/*
 * Unregister the fixed file set of @ctx and free everything attached to it.
 *
 * Kills the current ref node (if any) and the file data reference, flushes
 * the deferred file-put work and waits for data->done before the files and
 * the tables are released.
 *
 * Returns 0 on success, -ENXIO if no file set is registered.
 */
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	struct fixed_file_data *data = ctx->file_data;
	struct fixed_file_ref_node *ref_node = NULL;
	unsigned nr_tables, i;

	if (!data)
		return -ENXIO;

	/* data->node is read under data->lock */
	spin_lock(&data->lock);
	ref_node = data->node;
	spin_unlock(&data->lock);
	if (ref_node)
		percpu_ref_kill(&ref_node->refs);

	percpu_ref_kill(&data->refs);

	/* wait for all refs nodes to complete */
	flush_delayed_work(&ctx->file_put_work);
	wait_for_completion(&data->done);

	__io_sqe_files_unregister(ctx);
	/* one second-level table per IORING_MAX_FILES_TABLE registered files */
	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
	for (i = 0; i < nr_tables; i++)
		kfree(data->table[i].files);
	kfree(data->table);
	percpu_ref_exit(&data->refs);
	kfree(data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
	return 0;
}
6999 
io_put_sq_data(struct io_sq_data * sqd)7000 static void io_put_sq_data(struct io_sq_data *sqd)
7001 {
7002 	if (refcount_dec_and_test(&sqd->refs)) {
7003 		/*
7004 		 * The park is a bit of a work-around, without it we get
7005 		 * warning spews on shutdown with SQPOLL set and affinity
7006 		 * set to a single CPU.
7007 		 */
7008 		if (sqd->thread) {
7009 			kthread_park(sqd->thread);
7010 			kthread_stop(sqd->thread);
7011 		}
7012 
7013 		kfree(sqd);
7014 	}
7015 }
7016 
io_attach_sq_data(struct io_uring_params * p)7017 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7018 {
7019 	struct io_ring_ctx *ctx_attach;
7020 	struct io_sq_data *sqd;
7021 	struct fd f;
7022 
7023 	f = fdget(p->wq_fd);
7024 	if (!f.file)
7025 		return ERR_PTR(-ENXIO);
7026 	if (f.file->f_op != &io_uring_fops) {
7027 		fdput(f);
7028 		return ERR_PTR(-EINVAL);
7029 	}
7030 
7031 	ctx_attach = f.file->private_data;
7032 	sqd = ctx_attach->sq_data;
7033 	if (!sqd) {
7034 		fdput(f);
7035 		return ERR_PTR(-EINVAL);
7036 	}
7037 
7038 	refcount_inc(&sqd->refs);
7039 	fdput(f);
7040 	return sqd;
7041 }
7042 
io_get_sq_data(struct io_uring_params * p)7043 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7044 {
7045 	struct io_sq_data *sqd;
7046 
7047 	if (p->flags & IORING_SETUP_ATTACH_WQ)
7048 		return io_attach_sq_data(p);
7049 
7050 	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7051 	if (!sqd)
7052 		return ERR_PTR(-ENOMEM);
7053 
7054 	refcount_set(&sqd->refs, 1);
7055 	INIT_LIST_HEAD(&sqd->ctx_list);
7056 	INIT_LIST_HEAD(&sqd->ctx_new_list);
7057 	mutex_init(&sqd->ctx_lock);
7058 	mutex_init(&sqd->lock);
7059 	init_waitqueue_head(&sqd->wait);
7060 	return sqd;
7061 }
7062 
io_sq_thread_unpark(struct io_sq_data * sqd)7063 static void io_sq_thread_unpark(struct io_sq_data *sqd)
7064 	__releases(&sqd->lock)
7065 {
7066 	if (!sqd->thread)
7067 		return;
7068 	kthread_unpark(sqd->thread);
7069 	mutex_unlock(&sqd->lock);
7070 }
7071 
io_sq_thread_park(struct io_sq_data * sqd)7072 static void io_sq_thread_park(struct io_sq_data *sqd)
7073 	__acquires(&sqd->lock)
7074 {
7075 	if (!sqd->thread)
7076 		return;
7077 	mutex_lock(&sqd->lock);
7078 	kthread_park(sqd->thread);
7079 }
7080 
io_sq_thread_stop(struct io_ring_ctx * ctx)7081 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
7082 {
7083 	struct io_sq_data *sqd = ctx->sq_data;
7084 
7085 	if (sqd) {
7086 		if (sqd->thread) {
7087 			/*
7088 			 * We may arrive here from the error branch in
7089 			 * io_sq_offload_create() where the kthread is created
7090 			 * without being waked up, thus wake it up now to make
7091 			 * sure the wait will complete.
7092 			 */
7093 			wake_up_process(sqd->thread);
7094 			wait_for_completion(&ctx->sq_thread_comp);
7095 
7096 			io_sq_thread_park(sqd);
7097 		}
7098 
7099 		mutex_lock(&sqd->ctx_lock);
7100 		list_del(&ctx->sqd_list);
7101 		mutex_unlock(&sqd->ctx_lock);
7102 
7103 		if (sqd->thread) {
7104 			finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
7105 			io_sq_thread_unpark(sqd);
7106 		}
7107 
7108 		io_put_sq_data(sqd);
7109 		ctx->sq_data = NULL;
7110 	}
7111 }
7112 
io_finish_async(struct io_ring_ctx * ctx)7113 static void io_finish_async(struct io_ring_ctx *ctx)
7114 {
7115 	io_sq_thread_stop(ctx);
7116 
7117 	if (ctx->io_wq) {
7118 		io_wq_destroy(ctx->io_wq);
7119 		ctx->io_wq = NULL;
7120 	}
7121 }
7122 
7123 #if defined(CONFIG_UNIX)
7124 /*
7125  * Ensure the UNIX gc is aware of our file set, so we are certain that
7126  * the io_uring can be safely unregistered on process exit, even if we have
7127  * loops in the file referencing.
7128  */
__io_sqe_files_scm(struct io_ring_ctx * ctx,int nr,int offset)7129 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7130 {
7131 	struct sock *sk = ctx->ring_sock->sk;
7132 	struct scm_fp_list *fpl;
7133 	struct sk_buff *skb;
7134 	int i, nr_files;
7135 
7136 	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7137 	if (!fpl)
7138 		return -ENOMEM;
7139 
7140 	skb = alloc_skb(0, GFP_KERNEL);
7141 	if (!skb) {
7142 		kfree(fpl);
7143 		return -ENOMEM;
7144 	}
7145 
7146 	skb->sk = sk;
7147 
7148 	nr_files = 0;
7149 	fpl->user = get_uid(ctx->user);
7150 	for (i = 0; i < nr; i++) {
7151 		struct file *file = io_file_from_index(ctx, i + offset);
7152 
7153 		if (!file)
7154 			continue;
7155 		fpl->fp[nr_files] = get_file(file);
7156 		unix_inflight(fpl->user, fpl->fp[nr_files]);
7157 		nr_files++;
7158 	}
7159 
7160 	if (nr_files) {
7161 		fpl->max = SCM_MAX_FD;
7162 		fpl->count = nr_files;
7163 		UNIXCB(skb).fp = fpl;
7164 		skb->destructor = unix_destruct_scm;
7165 		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7166 		skb_queue_head(&sk->sk_receive_queue, skb);
7167 
7168 		for (i = 0; i < nr_files; i++)
7169 			fput(fpl->fp[i]);
7170 	} else {
7171 		kfree_skb(skb);
7172 		kfree(fpl);
7173 	}
7174 
7175 	return 0;
7176 }
7177 
7178 /*
7179  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7180  * causes regular reference counting to break down. We rely on the UNIX
7181  * garbage collection to take care of this problem for us.
7182  */
io_sqe_files_scm(struct io_ring_ctx * ctx)7183 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7184 {
7185 	unsigned left, total;
7186 	int ret = 0;
7187 
7188 	total = 0;
7189 	left = ctx->nr_user_files;
7190 	while (left) {
7191 		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7192 
7193 		ret = __io_sqe_files_scm(ctx, this_files, total);
7194 		if (ret)
7195 			break;
7196 		left -= this_files;
7197 		total += this_files;
7198 	}
7199 
7200 	if (!ret)
7201 		return 0;
7202 
7203 	while (total < ctx->nr_user_files) {
7204 		struct file *file = io_file_from_index(ctx, total);
7205 
7206 		if (file)
7207 			fput(file);
7208 		total++;
7209 	}
7210 
7211 	return ret;
7212 }
7213 #else
/* Without CONFIG_UNIX there is nothing to hand over: always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
7218 #endif
7219 
io_sqe_alloc_file_tables(struct fixed_file_data * file_data,unsigned nr_tables,unsigned nr_files)7220 static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
7221 				    unsigned nr_tables, unsigned nr_files)
7222 {
7223 	int i;
7224 
7225 	for (i = 0; i < nr_tables; i++) {
7226 		struct fixed_file_table *table = &file_data->table[i];
7227 		unsigned this_files;
7228 
7229 		this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7230 		table->files = kcalloc(this_files, sizeof(struct file *),
7231 					GFP_KERNEL);
7232 		if (!table->files)
7233 			break;
7234 		nr_files -= this_files;
7235 	}
7236 
7237 	if (i == nr_tables)
7238 		return 0;
7239 
7240 	for (i = 0; i < nr_tables; i++) {
7241 		struct fixed_file_table *table = &file_data->table[i];
7242 		kfree(table->files);
7243 	}
7244 	return 1;
7245 }
7246 
/*
 * Drop the registered-file reference on @file.
 *
 * With CONFIG_UNIX the file is tracked in the file list of one of the skbs
 * queued on the ring socket: locate that skb, take the file out of its
 * list and put the file. Without CONFIG_UNIX this is a plain fput().
 */
static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	/* private queue collecting the skbs inspected so far */
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			/* number of entries behind the one being removed */
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				/* that was the last file of this skb */
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			/* NULL @file doubles as the "found" marker below */
			file = NULL;
			break;
		}

		if (!file)
			break;

		/* not in this skb: set it aside and look at the next one */
		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	/* move the skbs that were set aside back to the socket queue */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
7308 
/*
 * A registered file queued for removal; entries are linked on the
 * ->file_list of a fixed_file_ref_node (see io_queue_file_removal()).
 */
struct io_file_put {
	struct list_head list;	/* link in fixed_file_ref_node->file_list */
	struct file *file;	/* file to hand to io_ring_file_put() */
};
7313 
__io_file_put_work(struct fixed_file_ref_node * ref_node)7314 static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
7315 {
7316 	struct fixed_file_data *file_data = ref_node->file_data;
7317 	struct io_ring_ctx *ctx = file_data->ctx;
7318 	struct io_file_put *pfile, *tmp;
7319 
7320 	list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
7321 		list_del(&pfile->list);
7322 		io_ring_file_put(ctx, pfile->file);
7323 		kfree(pfile);
7324 	}
7325 
7326 	percpu_ref_exit(&ref_node->refs);
7327 	kfree(ref_node);
7328 	percpu_ref_put(&file_data->refs);
7329 }
7330 
io_file_put_work(struct work_struct * work)7331 static void io_file_put_work(struct work_struct *work)
7332 {
7333 	struct io_ring_ctx *ctx;
7334 	struct llist_node *node;
7335 
7336 	ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
7337 	node = llist_del_all(&ctx->file_put_llist);
7338 
7339 	while (node) {
7340 		struct fixed_file_ref_node *ref_node;
7341 		struct llist_node *next = node->next;
7342 
7343 		ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
7344 		__io_file_put_work(ref_node);
7345 		node = next;
7346 	}
7347 }
7348 
io_file_data_ref_zero(struct percpu_ref * ref)7349 static void io_file_data_ref_zero(struct percpu_ref *ref)
7350 {
7351 	struct fixed_file_ref_node *ref_node;
7352 	struct fixed_file_data *data;
7353 	struct io_ring_ctx *ctx;
7354 	bool first_add = false;
7355 	int delay = HZ;
7356 
7357 	ref_node = container_of(ref, struct fixed_file_ref_node, refs);
7358 	data = ref_node->file_data;
7359 	ctx = data->ctx;
7360 
7361 	spin_lock(&data->lock);
7362 	ref_node->done = true;
7363 
7364 	while (!list_empty(&data->ref_list)) {
7365 		ref_node = list_first_entry(&data->ref_list,
7366 					struct fixed_file_ref_node, node);
7367 		/* recycle ref nodes in order */
7368 		if (!ref_node->done)
7369 			break;
7370 		list_del(&ref_node->node);
7371 		first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
7372 	}
7373 	spin_unlock(&data->lock);
7374 
7375 	if (percpu_ref_is_dying(&data->refs))
7376 		delay = 0;
7377 
7378 	if (!delay)
7379 		mod_delayed_work(system_wq, &ctx->file_put_work, 0);
7380 	else if (first_add)
7381 		queue_delayed_work(system_wq, &ctx->file_put_work, delay);
7382 }
7383 
alloc_fixed_file_ref_node(struct io_ring_ctx * ctx)7384 static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
7385 			struct io_ring_ctx *ctx)
7386 {
7387 	struct fixed_file_ref_node *ref_node;
7388 
7389 	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7390 	if (!ref_node)
7391 		return ERR_PTR(-ENOMEM);
7392 
7393 	if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
7394 			    0, GFP_KERNEL)) {
7395 		kfree(ref_node);
7396 		return ERR_PTR(-ENOMEM);
7397 	}
7398 	INIT_LIST_HEAD(&ref_node->node);
7399 	INIT_LIST_HEAD(&ref_node->file_list);
7400 	ref_node->file_data = ctx->file_data;
7401 	ref_node->done = false;
7402 	return ref_node;
7403 }
7404 
/*
 * Free a ref node obtained from alloc_fixed_file_ref_node(). Used in this
 * file for a node that was never installed; it does not touch any list
 * the node might be linked on.
 */
static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}
7410 
/*
 * Register a set of @nr_args files, given as an array of __s32 file
 * descriptors at user address @arg, as the fixed file set of @ctx. An fd
 * of -1 leaves the corresponding slot empty.
 *
 * Returns 0 on success or a negative error:
 *   -EBUSY   a file set is already registered
 *   -EINVAL  @nr_args is zero
 *   -EMFILE  @nr_args exceeds IORING_MAX_FIXED_FILES
 *   -ENOMEM  allocation failure
 *   -EFAULT  the fd array could not be read
 *   -EBADF   an fd is invalid or refers to an io_uring instance
 * or whatever io_sqe_files_scm() / alloc_fixed_file_ref_node() report.
 */
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
				 unsigned nr_args)
{
	__s32 __user *fds = (__s32 __user *) arg;
	unsigned nr_tables, i;
	struct file *file;
	int fd, ret = -ENOMEM;
	struct fixed_file_ref_node *ref_node;
	struct fixed_file_data *file_data;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;

	file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
	if (!file_data)
		return -ENOMEM;
	file_data->ctx = ctx;
	init_completion(&file_data->done);
	INIT_LIST_HEAD(&file_data->ref_list);
	spin_lock_init(&file_data->lock);

	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
	file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
				   GFP_KERNEL);
	if (!file_data->table)
		goto out_free;

	if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
				PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto out_free;

	if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
		goto out_ref;
	ctx->file_data = file_data;

	/*
	 * ctx->nr_user_files advances in step with i, so on failure the
	 * out_fput path knows how many slots may hold a file reference.
	 */
	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct fixed_file_table *table;
		unsigned index;

		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto out_fput;
		}
		/* allow sparse sets */
		if (fd == -1)
			continue;

		file = fget(fd);
		/* also the error reported for an io_uring fd below */
		ret = -EBADF;
		if (!file)
			goto out_fput;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (file->f_op == &io_uring_fops) {
			fput(file);
			goto out_fput;
		}
		table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
		index = i & IORING_FILE_TABLE_MASK;
		table->files[index] = file;
	}

	ret = io_sqe_files_scm(ctx);
	if (ret) {
		io_sqe_files_unregister(ctx);
		return ret;
	}

	ref_node = alloc_fixed_file_ref_node(ctx);
	if (IS_ERR(ref_node)) {
		io_sqe_files_unregister(ctx);
		return PTR_ERR(ref_node);
	}

	file_data->node = ref_node;
	/*
	 * file_data->lock is also taken by io_file_data_ref_zero(), which
	 * can run from softirq context; always take it bottom-half safe.
	 */
	spin_lock_bh(&file_data->lock);
	list_add_tail(&ref_node->node, &file_data->ref_list);
	spin_unlock_bh(&file_data->lock);
	/* reference held on behalf of the node just installed */
	percpu_ref_get(&file_data->refs);
	return ret;
out_fput:
	for (i = 0; i < ctx->nr_user_files; i++) {
		file = io_file_from_index(ctx, i);
		if (file)
			fput(file);
	}
	for (i = 0; i < nr_tables; i++)
		kfree(file_data->table[i].files);
	ctx->nr_user_files = 0;
out_ref:
	percpu_ref_exit(&file_data->refs);
out_free:
	kfree(file_data->table);
	kfree(file_data);
	ctx->file_data = NULL;
	return ret;
}
7518 
/*
 * Make a single newly installed fixed file (@file, stored in slot @index)
 * known to the UNIX socket accounting of the ring.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm(). Without
 * CONFIG_UNIX this is a no-op returning 0.
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/*
			 * Take the skb off the queue and drop the queue lock
			 * while its file list is extended, then put it back
			 * at the head.
			 */
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
		} else {
			/* head skb is full */
			skb = NULL;
		}
	}
	spin_unlock_irq(&head->lock);

	if (skb) {
		/* the list now holds its own reference from get_file() */
		fput(file);
		return 0;
	}

	return __io_sqe_files_scm(ctx, 1, index);
#else
	return 0;
#endif
}
7561 
io_queue_file_removal(struct fixed_file_data * data,struct file * file)7562 static int io_queue_file_removal(struct fixed_file_data *data,
7563 				 struct file *file)
7564 {
7565 	struct io_file_put *pfile;
7566 	struct fixed_file_ref_node *ref_node = data->node;
7567 
7568 	pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
7569 	if (!pfile)
7570 		return -ENOMEM;
7571 
7572 	pfile->file = file;
7573 	list_add(&pfile->list, &ref_node->file_list);
7574 
7575 	return 0;
7576 }
7577 
__io_sqe_files_update(struct io_ring_ctx * ctx,struct io_uring_files_update * up,unsigned nr_args)7578 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7579 				 struct io_uring_files_update *up,
7580 				 unsigned nr_args)
7581 {
7582 	struct fixed_file_data *data = ctx->file_data;
7583 	struct fixed_file_ref_node *ref_node;
7584 	struct file *file;
7585 	__s32 __user *fds;
7586 	int fd, i, err;
7587 	__u32 done;
7588 	bool needs_switch = false;
7589 
7590 	if (check_add_overflow(up->offset, nr_args, &done))
7591 		return -EOVERFLOW;
7592 	if (done > ctx->nr_user_files)
7593 		return -EINVAL;
7594 
7595 	ref_node = alloc_fixed_file_ref_node(ctx);
7596 	if (IS_ERR(ref_node))
7597 		return PTR_ERR(ref_node);
7598 
7599 	done = 0;
7600 	fds = u64_to_user_ptr(up->fds);
7601 	while (nr_args) {
7602 		struct fixed_file_table *table;
7603 		unsigned index;
7604 
7605 		err = 0;
7606 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7607 			err = -EFAULT;
7608 			break;
7609 		}
7610 		i = array_index_nospec(up->offset, ctx->nr_user_files);
7611 		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7612 		index = i & IORING_FILE_TABLE_MASK;
7613 		if (table->files[index]) {
7614 			file = table->files[index];
7615 			err = io_queue_file_removal(data, file);
7616 			if (err)
7617 				break;
7618 			table->files[index] = NULL;
7619 			needs_switch = true;
7620 		}
7621 		if (fd != -1) {
7622 			file = fget(fd);
7623 			if (!file) {
7624 				err = -EBADF;
7625 				break;
7626 			}
7627 			/*
7628 			 * Don't allow io_uring instances to be registered. If
7629 			 * UNIX isn't enabled, then this causes a reference
7630 			 * cycle and this instance can never get freed. If UNIX
7631 			 * is enabled we'll handle it just fine, but there's
7632 			 * still no point in allowing a ring fd as it doesn't
7633 			 * support regular read/write anyway.
7634 			 */
7635 			if (file->f_op == &io_uring_fops) {
7636 				fput(file);
7637 				err = -EBADF;
7638 				break;
7639 			}
7640 			table->files[index] = file;
7641 			err = io_sqe_file_register(ctx, file, i);
7642 			if (err) {
7643 				table->files[index] = NULL;
7644 				fput(file);
7645 				break;
7646 			}
7647 		}
7648 		nr_args--;
7649 		done++;
7650 		up->offset++;
7651 	}
7652 
7653 	if (needs_switch) {
7654 		percpu_ref_kill(&data->node->refs);
7655 		spin_lock(&data->lock);
7656 		list_add_tail(&ref_node->node, &data->ref_list);
7657 		data->node = ref_node;
7658 		spin_unlock(&data->lock);
7659 		percpu_ref_get(&ctx->file_data->refs);
7660 	} else
7661 		destroy_fixed_file_ref_node(ref_node);
7662 
7663 	return done ? done : err;
7664 }
7665 
/*
 * Validate a files-update request whose struct io_uring_files_update lives
 * at user address @arg and pass it on to __io_sqe_files_update().
 *
 * Returns -ENXIO without a registered file set, -EINVAL for zero @nr_args
 * or a non-zero resv field, -EFAULT if @arg cannot be read, otherwise the
 * result of __io_sqe_files_update().
 */
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned nr_args)
{
	struct io_uring_files_update up;
	int ret;

	if (!ctx->file_data)
		ret = -ENXIO;
	else if (!nr_args)
		ret = -EINVAL;
	else if (copy_from_user(&up, arg, sizeof(up)))
		ret = -EFAULT;
	else if (up.resv)
		ret = -EINVAL;
	else
		ret = __io_sqe_files_update(ctx, &up, nr_args);

	return ret;
}
7682 
io_free_work(struct io_wq_work * work)7683 static void io_free_work(struct io_wq_work *work)
7684 {
7685 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7686 
7687 	/* Consider that io_steal_work() relies on this ref */
7688 	io_put_req(req);
7689 }
7690 
io_init_wq_offload(struct io_ring_ctx * ctx,struct io_uring_params * p)7691 static int io_init_wq_offload(struct io_ring_ctx *ctx,
7692 			      struct io_uring_params *p)
7693 {
7694 	struct io_wq_data data;
7695 	struct fd f;
7696 	struct io_ring_ctx *ctx_attach;
7697 	unsigned int concurrency;
7698 	int ret = 0;
7699 
7700 	data.user = ctx->user;
7701 	data.free_work = io_free_work;
7702 	data.do_work = io_wq_submit_work;
7703 
7704 	if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
7705 		/* Do QD, or 4 * CPUS, whatever is smallest */
7706 		concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7707 
7708 		ctx->io_wq = io_wq_create(concurrency, &data);
7709 		if (IS_ERR(ctx->io_wq)) {
7710 			ret = PTR_ERR(ctx->io_wq);
7711 			ctx->io_wq = NULL;
7712 		}
7713 		return ret;
7714 	}
7715 
7716 	f = fdget(p->wq_fd);
7717 	if (!f.file)
7718 		return -EBADF;
7719 
7720 	if (f.file->f_op != &io_uring_fops) {
7721 		ret = -EINVAL;
7722 		goto out_fput;
7723 	}
7724 
7725 	ctx_attach = f.file->private_data;
7726 	/* @io_wq is protected by holding the fd */
7727 	if (!io_wq_get(ctx_attach->io_wq, &data)) {
7728 		ret = -EINVAL;
7729 		goto out_fput;
7730 	}
7731 
7732 	ctx->io_wq = ctx_attach->io_wq;
7733 out_fput:
7734 	fdput(f);
7735 	return ret;
7736 }
7737 
io_uring_alloc_task_context(struct task_struct * task)7738 static int io_uring_alloc_task_context(struct task_struct *task)
7739 {
7740 	struct io_uring_task *tctx;
7741 	int ret;
7742 
7743 	tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7744 	if (unlikely(!tctx))
7745 		return -ENOMEM;
7746 
7747 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7748 	if (unlikely(ret)) {
7749 		kfree(tctx);
7750 		return ret;
7751 	}
7752 
7753 	xa_init(&tctx->xa);
7754 	init_waitqueue_head(&tctx->wait);
7755 	tctx->last = NULL;
7756 	atomic_set(&tctx->in_idle, 0);
7757 	tctx->sqpoll = false;
7758 	io_init_identity(&tctx->__identity);
7759 	tctx->identity = &tctx->__identity;
7760 	task->io_uring = tctx;
7761 	return 0;
7762 }
7763 
__io_uring_free(struct task_struct * tsk)7764 void __io_uring_free(struct task_struct *tsk)
7765 {
7766 	struct io_uring_task *tctx = tsk->io_uring;
7767 
7768 	WARN_ON_ONCE(!xa_empty(&tctx->xa));
7769 	WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
7770 	if (tctx->identity != &tctx->__identity)
7771 		kfree(tctx->identity);
7772 	percpu_counter_destroy(&tctx->inflight);
7773 	kfree(tctx);
7774 	tsk->io_uring = NULL;
7775 }
7776 
/*
 * Set up the submission offload for @ctx: with IORING_SETUP_SQPOLL obtain
 * (or attach to) the io_sq_data and create its kthread if it has none yet,
 * then set up the io-wq via io_init_wq_offload().
 *
 * The kthread is only created here, not woken; see io_sq_offload_start().
 *
 * Returns 0 on success. On any failure io_finish_async() is called and a
 * negative error is returned: -EPERM without CAP_SYS_ADMIN for SQPOLL,
 * -EINVAL for an unusable sq_thread_cpu or for SQ_AFF without SQPOLL, or
 * the error of the step that failed.
 */
static int io_sq_offload_create(struct io_ring_ctx *ctx,
				struct io_uring_params *p)
{
	int ret;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		struct io_sq_data *sqd;

		ret = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto err;

		sqd = io_get_sq_data(p);
		if (IS_ERR(sqd)) {
			ret = PTR_ERR(sqd);
			goto err;
		}

		ctx->sq_data = sqd;
		/* add the ctx to the sqd's new-ctx list with the thread parked */
		io_sq_thread_park(sqd);
		mutex_lock(&sqd->ctx_lock);
		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
		mutex_unlock(&sqd->ctx_lock);
		io_sq_thread_unpark(sqd);

		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
		if (!ctx->sq_thread_idle)
			ctx->sq_thread_idle = HZ;

		/* attached to an sqd that already has its thread */
		if (sqd->thread)
			goto done;

		if (p->flags & IORING_SETUP_SQ_AFF) {
			int cpu = p->sq_thread_cpu;

			ret = -EINVAL;
			if (cpu >= nr_cpu_ids)
				goto err;
			if (!cpu_online(cpu))
				goto err;

			sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
							cpu, "io_uring-sq");
		} else {
			sqd->thread = kthread_create(io_sq_thread, sqd,
							"io_uring-sq");
		}
		if (IS_ERR(sqd->thread)) {
			ret = PTR_ERR(sqd->thread);
			/* don't leave an ERR_PTR behind for the cleanup path */
			sqd->thread = NULL;
			goto err;
		}
		ret = io_uring_alloc_task_context(sqd->thread);
		if (ret)
			goto err;
	} else if (p->flags & IORING_SETUP_SQ_AFF) {
		/* Can't have SQ_AFF without SQPOLL */
		ret = -EINVAL;
		goto err;
	}

done:
	ret = io_init_wq_offload(ctx, p);
	if (ret)
		goto err;

	return 0;
err:
	io_finish_async(ctx);
	return ret;
}
7848 
io_sq_offload_start(struct io_ring_ctx * ctx)7849 static void io_sq_offload_start(struct io_ring_ctx *ctx)
7850 {
7851 	struct io_sq_data *sqd = ctx->sq_data;
7852 
7853 	if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
7854 		wake_up_process(sqd->thread);
7855 }
7856 
/* Subtract @nr_pages from @user's locked_vm counter. */
static inline void __io_unaccount_mem(struct user_struct *user,
				      unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}
7862 
__io_account_mem(struct user_struct * user,unsigned long nr_pages)7863 static inline int __io_account_mem(struct user_struct *user,
7864 				   unsigned long nr_pages)
7865 {
7866 	unsigned long page_limit, cur_pages, new_pages;
7867 
7868 	/* Don't allow more pages than we can safely lock */
7869 	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7870 
7871 	do {
7872 		cur_pages = atomic_long_read(&user->locked_vm);
7873 		new_pages = cur_pages + nr_pages;
7874 		if (new_pages > page_limit)
7875 			return -ENOMEM;
7876 	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7877 					new_pages) != cur_pages);
7878 
7879 	return 0;
7880 }
7881 
io_unaccount_mem(struct io_ring_ctx * ctx,unsigned long nr_pages,enum io_mem_account acct)7882 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7883 			     enum io_mem_account acct)
7884 {
7885 	if (ctx->limit_mem)
7886 		__io_unaccount_mem(ctx->user, nr_pages);
7887 
7888 	if (ctx->mm_account) {
7889 		if (acct == ACCT_LOCKED)
7890 			ctx->mm_account->locked_vm -= nr_pages;
7891 		else if (acct == ACCT_PINNED)
7892 			atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
7893 	}
7894 }
7895 
io_account_mem(struct io_ring_ctx * ctx,unsigned long nr_pages,enum io_mem_account acct)7896 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7897 			  enum io_mem_account acct)
7898 {
7899 	int ret;
7900 
7901 	if (ctx->limit_mem) {
7902 		ret = __io_account_mem(ctx->user, nr_pages);
7903 		if (ret)
7904 			return ret;
7905 	}
7906 
7907 	if (ctx->mm_account) {
7908 		if (acct == ACCT_LOCKED)
7909 			ctx->mm_account->locked_vm += nr_pages;
7910 		else if (acct == ACCT_PINNED)
7911 			atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
7912 	}
7913 
7914 	return 0;
7915 }
7916 
/*
 * Drop a reference on the compound page backing @ptr and free the page
 * when that was the last one. NULL is accepted and ignored.
 */
static void io_mem_free(void *ptr)
{
	struct page *head;

	if (!ptr)
		return;

	head = virt_to_head_page(ptr);
	if (!put_page_testzero(head))
		return;

	free_compound_page(head);
}
7928 
/*
 * Allocate zeroed pages covering @size bytes as one compound page, without
 * retrying hard and without allocation-failure warnings.
 *
 * Returns the kernel address, or NULL on failure.
 */
static void *io_mem_alloc(size_t size)
{
	const gfp_t gfp = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
			  __GFP_COMP | __GFP_NORETRY;
	unsigned long addr = __get_free_pages(gfp, get_order(size));

	return (void *) addr;
}
7936 
/*
 * Compute the size in bytes of the shared ring area: struct io_rings with
 * @cq_entries CQEs (cache-line aligned on SMP), followed by the SQ index
 * array of @sq_entries u32 entries.
 *
 * If @sq_offset is non-NULL it receives the offset of the SQ array.
 * Returns SIZE_MAX if any step of the computation overflows.
 */
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
				size_t *sq_offset)
{
	struct io_rings *rings;
	size_t sq_array_off, sq_array_bytes, total;

	sq_array_off = struct_size(rings, cqes, cq_entries);
	if (sq_array_off == SIZE_MAX)
		return SIZE_MAX;

#ifdef CONFIG_SMP
	sq_array_off = ALIGN(sq_array_off, SMP_CACHE_BYTES);
	/* aligning up wrapped around */
	if (sq_array_off == 0)
		return SIZE_MAX;
#endif

	if (sq_offset)
		*sq_offset = sq_array_off;

	sq_array_bytes = array_size(sizeof(u32), sq_entries);
	if (sq_array_bytes == SIZE_MAX)
		return SIZE_MAX;

	if (check_add_overflow(sq_array_off, sq_array_bytes, &total))
		return SIZE_MAX;

	return total;
}
7965 
ring_pages(unsigned sq_entries,unsigned cq_entries)7966 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7967 {
7968 	size_t pages;
7969 
7970 	pages = (size_t)1 << get_order(
7971 		rings_size(sq_entries, cq_entries, NULL));
7972 	pages += (size_t)1 << get_order(
7973 		array_size(sizeof(struct io_uring_sqe), sq_entries));
7974 
7975 	return pages;
7976 }
7977 
io_sqe_buffer_unregister(struct io_ring_ctx * ctx)7978 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7979 {
7980 	int i, j;
7981 
7982 	if (!ctx->user_bufs)
7983 		return -ENXIO;
7984 
7985 	for (i = 0; i < ctx->nr_user_bufs; i++) {
7986 		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7987 
7988 		for (j = 0; j < imu->nr_bvecs; j++)
7989 			unpin_user_page(imu->bvec[j].bv_page);
7990 
7991 		if (imu->acct_pages)
7992 			io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
7993 		kvfree(imu->bvec);
7994 		imu->nr_bvecs = 0;
7995 	}
7996 
7997 	kfree(ctx->user_bufs);
7998 	ctx->user_bufs = NULL;
7999 	ctx->nr_user_bufs = 0;
8000 	return 0;
8001 }
8002 
/*
 * Copy entry @index of the user iovec array at @arg into @dst, converting
 * from the compat layout when the ctx is a compat one.
 *
 * Returns 0 on success or -EFAULT if the entry cannot be read.
 */
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src = (struct iovec __user *) arg;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, ciovs + index, sizeof(ciov)))
			return -EFAULT;

		dst->iov_len = ciov.iov_len;
		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		return 0;
	}
#endif
	return copy_from_user(dst, src + index, sizeof(*dst)) ? -EFAULT : 0;
}
8027 
8028 /*
8029  * Not super efficient, but this is just a registration time. And we do cache
8030  * the last compound head, so generally we'll only do a full search if we don't
8031  * match that one.
8032  *
8033  * We check if the given compound head page has already been accounted, to
8034  * avoid double accounting it. This allows us to account the full size of the
8035  * page, not just the constituent pages of a huge page.
8036  */
headpage_already_acct(struct io_ring_ctx * ctx,struct page ** pages,int nr_pages,struct page * hpage)8037 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8038 				  int nr_pages, struct page *hpage)
8039 {
8040 	int i, j;
8041 
8042 	/* check current page array */
8043 	for (i = 0; i < nr_pages; i++) {
8044 		if (!PageCompound(pages[i]))
8045 			continue;
8046 		if (compound_head(pages[i]) == hpage)
8047 			return true;
8048 	}
8049 
8050 	/* check previously registered pages */
8051 	for (i = 0; i < ctx->nr_user_bufs; i++) {
8052 		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8053 
8054 		for (j = 0; j < imu->nr_bvecs; j++) {
8055 			if (!PageCompound(imu->bvec[j].bv_page))
8056 				continue;
8057 			if (compound_head(imu->bvec[j].bv_page) == hpage)
8058 				return true;
8059 		}
8060 	}
8061 
8062 	return false;
8063 }
8064 
io_buffer_account_pin(struct io_ring_ctx * ctx,struct page ** pages,int nr_pages,struct io_mapped_ubuf * imu,struct page ** last_hpage)8065 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8066 				 int nr_pages, struct io_mapped_ubuf *imu,
8067 				 struct page **last_hpage)
8068 {
8069 	int i, ret;
8070 
8071 	for (i = 0; i < nr_pages; i++) {
8072 		if (!PageCompound(pages[i])) {
8073 			imu->acct_pages++;
8074 		} else {
8075 			struct page *hpage;
8076 
8077 			hpage = compound_head(pages[i]);
8078 			if (hpage == *last_hpage)
8079 				continue;
8080 			*last_hpage = hpage;
8081 			if (headpage_already_acct(ctx, pages, i, hpage))
8082 				continue;
8083 			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8084 		}
8085 	}
8086 
8087 	if (!imu->acct_pages)
8088 		return 0;
8089 
8090 	ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
8091 	if (ret)
8092 		imu->acct_pages = 0;
8093 	return ret;
8094 }
8095 
/*
 * Register @nr_args user buffers, described by the iovec array at @arg, as
 * fixed buffers on @ctx. For every iovec the backing pages are pinned, charged
 * through io_buffer_account_pin(), and recorded as a bio_vec array in the
 * matching ctx->user_bufs[] slot.
 *
 * Returns 0 on success. Fails with -EBUSY if ctx->user_bufs is already set,
 * -EINVAL if @nr_args is zero or larger than UIO_MAXIOV, -ENOMEM on
 * allocation failure, -EFAULT for an iovec with a NULL base, zero length or
 * a length above SZ_1G, or when fewer pages than requested were pinned,
 * -EOPNOTSUPP when a page belongs to a file backed vma for which
 * is_file_hugepages() is false, or with the error returned by io_copy_iov(),
 * pin_user_pages() or io_buffer_account_pin(). Any failure inside the loop
 * ends in io_sqe_buffer_unregister() before returning.
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
				  unsigned nr_args)
{
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	struct page *last_hpage = NULL;
	/* got_pages: capacity of the pages[]/vmas[] scratch arrays */
	int i, j, got_pages = 0;
	int ret = -EINVAL;

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > UIO_MAXIOV)
		return -EINVAL;

	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
					GFP_KERNEL);
	if (!ctx->user_bufs)
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
		unsigned long off, start, end, ubuf;
		int pret, nr_pages;
		struct iovec iov;
		size_t size;

		ret = io_copy_iov(ctx, &iov, arg, i);
		if (ret)
			goto err;

		/*
		 * Don't impose further limits on the size and buffer
		 * constraints here, we'll -EINVAL later when IO is
		 * submitted if they are wrong.
		 */
		ret = -EFAULT;
		if (!iov.iov_base || !iov.iov_len)
			goto err;

		/* arbitrary limit, but we need something */
		if (iov.iov_len > SZ_1G)
			goto err;

		/* number of pages touched by [ubuf, ubuf + iov_len) */
		ubuf = (unsigned long) iov.iov_base;
		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		start = ubuf >> PAGE_SHIFT;
		nr_pages = end - start;

		ret = 0;
		/* grow the scratch arrays only when this buffer needs more */
		if (!pages || nr_pages > got_pages) {
			kvfree(vmas);
			kvfree(pages);
			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
						GFP_KERNEL);
			vmas = kvmalloc_array(nr_pages,
					sizeof(struct vm_area_struct *),
					GFP_KERNEL);
			if (!pages || !vmas) {
				ret = -ENOMEM;
				goto err;
			}
			got_pages = nr_pages;
		}

		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
						GFP_KERNEL);
		ret = -ENOMEM;
		if (!imu->bvec)
			goto err;

		ret = 0;
		mmap_read_lock(current->mm);
		pret = pin_user_pages(ubuf, nr_pages,
				      FOLL_WRITE | FOLL_LONGTERM,
				      pages, vmas);
		if (pret == nr_pages) {
			/* don't support file backed memory */
			for (j = 0; j < nr_pages; j++) {
				struct vm_area_struct *vma = vmas[j];

				if (vma->vm_file &&
				    !is_file_hugepages(vma->vm_file)) {
					ret = -EOPNOTSUPP;
					break;
				}
			}
		} else {
			/* a short pin is reported as -EFAULT */
			ret = pret < 0 ? pret : -EFAULT;
		}
		mmap_read_unlock(current->mm);
		if (ret) {
			/*
			 * if we did partial map, or found file backed vmas,
			 * release any pages we did get
			 */
			if (pret > 0)
				unpin_user_pages(pages, pret);
			/*
			 * ctx->nr_user_bufs has not been bumped for this slot
			 * yet, so the bvec is freed here; presumably
			 * io_sqe_buffer_unregister() only walks counted slots
			 * (NOTE(review): confirm against its definition).
			 */
			kvfree(imu->bvec);
			goto err;
		}

		ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
		if (ret) {
			unpin_user_pages(pages, pret);
			kvfree(imu->bvec);
			goto err;
		}

		/* only the first page can start at a non-zero offset */
		off = ubuf & ~PAGE_MASK;
		size = iov.iov_len;
		for (j = 0; j < nr_pages; j++) {
			size_t vec_len;

			vec_len = min_t(size_t, size, PAGE_SIZE - off);
			imu->bvec[j].bv_page = pages[j];
			imu->bvec[j].bv_len = vec_len;
			imu->bvec[j].bv_offset = off;
			off = 0;
			size -= vec_len;
		}
		/* store original address for later verification */
		imu->ubuf = ubuf;
		imu->len = iov.iov_len;
		imu->nr_bvecs = nr_pages;

		/* the slot is complete, count it as registered */
		ctx->nr_user_bufs++;
	}
	kvfree(pages);
	kvfree(vmas);
	return 0;
err:
	kvfree(pages);
	kvfree(vmas);
	/* tear down whatever was registered before the failure */
	io_sqe_buffer_unregister(ctx);
	return ret;
}
8232 
/*
 * Attach an eventfd to @ctx. @arg points to a user-space __s32 holding the
 * eventfd file descriptor.
 *
 * Returns 0 on success, -EBUSY if an eventfd is already attached, -EFAULT if
 * the descriptor cannot be copied from user space, or the error encoded by
 * eventfd_ctx_fdget(). On failure ctx->cq_ev_fd is left NULL.
 */
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
	__s32 __user *fds = arg;
	int fd, err;

	if (ctx->cq_ev_fd)
		return -EBUSY;
	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (!IS_ERR(ctx->cq_ev_fd))
		return 0;

	/* don't leave an error pointer behind in the ctx */
	err = PTR_ERR(ctx->cq_ev_fd);
	ctx->cq_ev_fd = NULL;
	return err;
}
8253 
io_eventfd_unregister(struct io_ring_ctx * ctx)8254 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8255 {
8256 	if (ctx->cq_ev_fd) {
8257 		eventfd_ctx_put(ctx->cq_ev_fd);
8258 		ctx->cq_ev_fd = NULL;
8259 		return 0;
8260 	}
8261 
8262 	return -ENXIO;
8263 }
8264 
/*
 * idr_for_each() callback used by io_destroy_buffers(): @p is the
 * struct io_buffer stored under @id and @data is the owning io_ring_ctx.
 * Asks __io_remove_buffers() to remove up to -1U buffers for that id.
 * Always returns 0 so that the iteration continues.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
	__io_remove_buffers((struct io_ring_ctx *)data, (struct io_buffer *)p,
			    id, -1U);
	return 0;
}
8273 
io_destroy_buffers(struct io_ring_ctx * ctx)8274 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8275 {
8276 	idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8277 	idr_destroy(&ctx->io_buffer_idr);
8278 }
8279 
/*
 * Final teardown of @ctx: release everything still registered on or
 * referenced by the ring, then free the ctx itself. Called from
 * io_ring_exit_work() once ctx->ref_comp has completed.
 */
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_finish_async(ctx);
	io_sqe_buffer_unregister(ctx);

	/* drop the task reference and the mm reference kept in mm_account */
	if (ctx->sqo_task) {
		put_task_struct(ctx->sqo_task);
		ctx->sqo_task = NULL;
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}

#ifdef CONFIG_BLK_CGROUP
	if (ctx->sqo_blkcg_css)
		css_put(ctx->sqo_blkcg_css);
#endif

	/* return values are ignored: nothing registered is not an error here */
	io_sqe_files_unregister(ctx);
	io_eventfd_unregister(ctx);
	io_destroy_buffers(ctx);
	idr_destroy(&ctx->personality_idr);

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif

	/* ring memory: the shared rings and the SQE array */
	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	put_cred(ctx->creds);
	kfree(ctx->cancel_hash);
	kmem_cache_free(req_cachep, ctx->fallback_req);
	kfree(ctx);
}
8319 
/*
 * ->poll() handler for an io_uring file. Registers the caller on ctx->cq_wait
 * and returns EPOLLOUT | EPOLLWRNORM when the SQ ring is not full, plus
 * EPOLLIN | EPOLLRDNORM when io_cqring_events() reports completions.
 */
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &ctx->cq_wait, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	if (!io_sqring_full(ctx))
		mask |= EPOLLOUT | EPOLLWRNORM;
	if (io_cqring_events(ctx, false))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}
8338 
/*
 * ->fasync() handler for an io_uring file: forwards to fasync_helper() using
 * the ring's cq_fasync list and returns its result.
 */
static int io_uring_fasync(int fd, struct file *file, int on)
{
	struct io_ring_ctx *ctx = file->private_data;

	return fasync_helper(fd, file, on, &ctx->cq_fasync);
}
8345 
io_remove_personalities(int id,void * p,void * data)8346 static int io_remove_personalities(int id, void *p, void *data)
8347 {
8348 	struct io_ring_ctx *ctx = data;
8349 	struct io_identity *iod;
8350 
8351 	iod = idr_remove(&ctx->personality_idr, id);
8352 	if (iod) {
8353 		put_cred(iod->creds);
8354 		if (refcount_dec_and_test(&iod->count))
8355 			kfree(iod);
8356 	}
8357 	return 0;
8358 }
8359 
io_ring_exit_work(struct work_struct * work)8360 static void io_ring_exit_work(struct work_struct *work)
8361 {
8362 	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8363 					       exit_work);
8364 
8365 	/*
8366 	 * If we're doing polled IO and end up having requests being
8367 	 * submitted async (out-of-line), then completions can come in while
8368 	 * we're waiting for refs to drop. We need to reap these manually,
8369 	 * as nobody else will be looking for them.
8370 	 */
8371 	do {
8372 		if (ctx->rings)
8373 			io_cqring_overflow_flush(ctx, true, NULL, NULL);
8374 		io_iopoll_try_reap_events(ctx);
8375 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8376 	io_ring_ctx_free(ctx);
8377 }
8378 
/*
 * Start tearing down @ctx: kill the ctx reference, cancel timeouts, polls
 * and io-wq work, reap what has already completed, drop the personalities,
 * and hand the rest of the teardown to io_ring_exit_work() on a workqueue.
 */
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	mutex_unlock(&ctx->uring_lock);

	/* a NULL task argument: not restricted to one task's requests */
	io_kill_timeouts(ctx, NULL);
	io_poll_remove_all(ctx, NULL);

	if (ctx->io_wq)
		io_wq_cancel_all(ctx->io_wq);

	/* if we failed setting up the ctx, we might not have any rings */
	if (ctx->rings)
		io_cqring_overflow_flush(ctx, true, NULL, NULL);
	io_iopoll_try_reap_events(ctx);
	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);

	/*
	 * Do this upfront, so we won't have a grace period where the ring
	 * is closed but resources aren't reaped yet. This can cause
	 * spurious failure in setting up a new ring.
	 */
	io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
			 ACCT_LOCKED);

	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernible change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
}
8414 
/*
 * ->release() handler for an io_uring file: detaches the ctx from the file
 * and starts ring teardown. Always returns 0.
 */
static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	/* clear the back pointer before the ctx starts going away */
	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}
8423 
io_wq_files_match(struct io_wq_work * work,void * data)8424 static bool io_wq_files_match(struct io_wq_work *work, void *data)
8425 {
8426 	struct files_struct *files = data;
8427 
8428 	return !files || ((work->flags & IO_WQ_WORK_FILES) &&
8429 				work->identity->files == files);
8430 }
8431 
8432 /*
8433  * Returns true if 'preq' is the link parent of 'req'
8434  */
io_match_link(struct io_kiocb * preq,struct io_kiocb * req)8435 static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
8436 {
8437 	struct io_kiocb *link;
8438 
8439 	if (!(preq->flags & REQ_F_LINK_HEAD))
8440 		return false;
8441 
8442 	list_for_each_entry(link, &preq->link_list, link_list) {
8443 		if (link == req)
8444 			return true;
8445 	}
8446 
8447 	return false;
8448 }
8449 
8450 /*
8451  * We're looking to cancel 'req' because it's holding on to our files, but
8452  * 'req' could be a link to another request. See if it is, and cancel that
8453  * parent request if so.
8454  */
io_poll_remove_link(struct io_ring_ctx * ctx,struct io_kiocb * req)8455 static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
8456 {
8457 	struct hlist_node *tmp;
8458 	struct io_kiocb *preq;
8459 	bool found = false;
8460 	int i;
8461 
8462 	spin_lock_irq(&ctx->completion_lock);
8463 	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
8464 		struct hlist_head *list;
8465 
8466 		list = &ctx->cancel_hash[i];
8467 		hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
8468 			found = io_match_link(preq, req);
8469 			if (found) {
8470 				io_poll_remove_one(preq);
8471 				break;
8472 			}
8473 		}
8474 	}
8475 	spin_unlock_irq(&ctx->completion_lock);
8476 	return found;
8477 }
8478 
io_timeout_remove_link(struct io_ring_ctx * ctx,struct io_kiocb * req)8479 static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
8480 				   struct io_kiocb *req)
8481 {
8482 	struct io_kiocb *preq;
8483 	bool found = false;
8484 
8485 	spin_lock_irq(&ctx->completion_lock);
8486 	list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
8487 		found = io_match_link(preq, req);
8488 		if (found) {
8489 			__io_timeout_cancel(preq);
8490 			break;
8491 		}
8492 	}
8493 	spin_unlock_irq(&ctx->completion_lock);
8494 	return found;
8495 }
8496 
io_cancel_link_cb(struct io_wq_work * work,void * data)8497 static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
8498 {
8499 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8500 	bool ret;
8501 
8502 	if (req->flags & REQ_F_LINK_TIMEOUT) {
8503 		unsigned long flags;
8504 		struct io_ring_ctx *ctx = req->ctx;
8505 
8506 		/* protect against races with linked timeouts */
8507 		spin_lock_irqsave(&ctx->completion_lock, flags);
8508 		ret = io_match_link(req, data);
8509 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
8510 	} else {
8511 		ret = io_match_link(req, data);
8512 	}
8513 	return ret;
8514 }
8515 
io_attempt_cancel(struct io_ring_ctx * ctx,struct io_kiocb * req)8516 static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
8517 {
8518 	enum io_wq_cancel cret;
8519 
8520 	/* cancel this particular work, if it's running */
8521 	cret = io_wq_cancel_work(ctx->io_wq, &req->work);
8522 	if (cret != IO_WQ_CANCEL_NOTFOUND)
8523 		return;
8524 
8525 	/* find links that hold this pending, cancel those */
8526 	cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
8527 	if (cret != IO_WQ_CANCEL_NOTFOUND)
8528 		return;
8529 
8530 	/* if we have a poll link holding this pending, cancel that */
8531 	if (io_poll_remove_link(ctx, req))
8532 		return;
8533 
8534 	/* final option, timeout link is holding this req pending */
8535 	io_timeout_remove_link(ctx, req);
8536 }
8537 
/*
 * Cancel deferred requests on @ctx on behalf of @task / @files.
 *
 * ctx->defer_list is searched from its tail for the last entry accepted by
 * both io_task_match() and io_match_files(). Everything from the head of the
 * list up to and including that entry is moved to a private list while
 * holding ctx->completion_lock, and those requests are then completed with
 * -ECANCELED after the lock has been dropped.
 */
static void io_cancel_defer_files(struct io_ring_ctx *ctx,
				  struct task_struct *task,
				  struct files_struct *files)
{
	struct io_defer_entry *de = NULL;
	LIST_HEAD(list);

	spin_lock_irq(&ctx->completion_lock);
	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
		if (io_task_match(de->req, task) &&
		    io_match_files(de->req, files)) {
			/* splice off the head of the list through 'de' */
			list_cut_position(&list, &ctx->defer_list, &de->list);
			break;
		}
	}
	spin_unlock_irq(&ctx->completion_lock);

	while (!list_empty(&list)) {
		de = list_first_entry(&list, struct io_defer_entry, list);
		list_del_init(&de->list);
		req_set_fail_links(de->req);
		/*
		 * NOTE(review): the request is still used after this put, so
		 * it presumably holds more than one reference here; confirm.
		 */
		io_put_req(de->req);
		io_req_complete(de->req, -ECANCELED);
		kfree(de);
	}
}
8564 
/*
 * Cancel inflight requests on @ctx that pin @files. With @files == NULL
 * every request on the inflight list is a candidate.
 *
 * Returns false if the inflight list was empty on entry, true otherwise
 * (the original comment describes this as having found and killed one or
 * more files pinning requests).
 */
static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
				  struct files_struct *files)
{
	if (list_empty_careful(&ctx->inflight_list))
		return false;

	/* cancel all at once, should be faster than doing it one by one*/
	io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);

	while (!list_empty_careful(&ctx->inflight_list)) {
		struct io_kiocb *cancel_req = NULL, *req;
		DEFINE_WAIT(wait);

		spin_lock_irq(&ctx->inflight_lock);
		list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
			/* skip requests that pin some other files_struct */
			if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
			    req->work.identity->files != files)
				continue;
			/* req is being completed, ignore */
			if (!refcount_inc_not_zero(&req->refs))
				continue;
			cancel_req = req;
			break;
		}
		/*
		 * Get on the wait queue while still holding inflight_lock,
		 * before the cancel attempt and the schedule() below.
		 */
		if (cancel_req)
			prepare_to_wait(&ctx->inflight_wait, &wait,
						TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&ctx->inflight_lock);

		/* We need to keep going until we don't find a matching req */
		if (!cancel_req)
			break;
		/* cancel this request, or head link requests */
		io_attempt_cancel(ctx, cancel_req);
		/* drop the reference taken in the search above */
		io_put_req(cancel_req);
		/* cancellations _may_ trigger task work */
		io_run_task_work();
		schedule();
		finish_wait(&ctx->inflight_wait, &wait);
	}

	return true;
}
8611 
io_cancel_task_cb(struct io_wq_work * work,void * data)8612 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8613 {
8614 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8615 	struct task_struct *task = data;
8616 
8617 	return io_task_match(req, task);
8618 }
8619 
__io_uring_cancel_task_requests(struct io_ring_ctx * ctx,struct task_struct * task,struct files_struct * files)8620 static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
8621 					    struct task_struct *task,
8622 					    struct files_struct *files)
8623 {
8624 	bool ret;
8625 
8626 	ret = io_uring_cancel_files(ctx, files);
8627 	if (!files) {
8628 		enum io_wq_cancel cret;
8629 
8630 		cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
8631 		if (cret != IO_WQ_CANCEL_NOTFOUND)
8632 			ret = true;
8633 
8634 		/* SQPOLL thread does its own polling */
8635 		if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
8636 			while (!list_empty_careful(&ctx->iopoll_list)) {
8637 				io_iopoll_try_reap_events(ctx);
8638 				ret = true;
8639 			}
8640 		}
8641 
8642 		ret |= io_poll_remove_all(ctx, task);
8643 		ret |= io_kill_timeouts(ctx, task);
8644 	}
8645 
8646 	return ret;
8647 }
8648 
/*
 * Cancel the requests that the current task (or, for an SQPOLL ring, the SQ
 * thread) has on @ctx. With @files set, only requests tied to those files
 * are targeted.
 *
 * We need to iteratively cancel requests, in case a request has dependent
 * hard links. These persist even for failure of cancelations, hence keep
 * looping until none are found.
 */
static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
					  struct files_struct *files)
{
	struct task_struct *task = current;

	/*
	 * For SQPOLL the requests belong to the SQ thread: target it instead
	 * of current, mark it idle and park it for the duration.
	 */
	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
		task = ctx->sq_data->thread;
		atomic_inc(&task->io_uring->in_idle);
		io_sq_thread_park(ctx->sq_data);
	}

	/* match deferred requests on files if given, on the task otherwise */
	if (files)
		io_cancel_defer_files(ctx, NULL, files);
	else
		io_cancel_defer_files(ctx, task, NULL);

	io_cqring_overflow_flush(ctx, true, task, files);

	/* repeat until a pass finds nothing left to cancel */
	while (__io_uring_cancel_task_requests(ctx, task, files)) {
		io_run_task_work();
		cond_resched();
	}

	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
		atomic_dec(&task->io_uring->in_idle);
		/*
		 * If the files that are going away are the ones in the thread
		 * identity, clear them out.
		 */
		if (task->io_uring->identity->files == files)
			task->io_uring->identity->files = NULL;
		io_sq_thread_unpark(ctx->sq_data);
	}
}
8688 
8689 /*
8690  * Note that this task has used io_uring. We use it for cancelation purposes.
8691  */
io_uring_add_task_file(struct io_ring_ctx * ctx,struct file * file)8692 static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
8693 {
8694 	struct io_uring_task *tctx = current->io_uring;
8695 
8696 	if (unlikely(!tctx)) {
8697 		int ret;
8698 
8699 		ret = io_uring_alloc_task_context(current);
8700 		if (unlikely(ret))
8701 			return ret;
8702 		tctx = current->io_uring;
8703 	}
8704 	if (tctx->last != file) {
8705 		void *old = xa_load(&tctx->xa, (unsigned long)file);
8706 
8707 		if (!old) {
8708 			get_file(file);
8709 			xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
8710 		}
8711 		tctx->last = file;
8712 	}
8713 
8714 	/*
8715 	 * This is race safe in that the task itself is doing this, hence it
8716 	 * cannot be going through the exit/cancel paths at the same time.
8717 	 * This cannot be modified while exit/cancel is running.
8718 	 */
8719 	if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
8720 		tctx->sqpoll = true;
8721 
8722 	return 0;
8723 }
8724 
8725 /*
8726  * Remove this io_uring_file -> task mapping.
8727  */
io_uring_del_task_file(struct file * file)8728 static void io_uring_del_task_file(struct file *file)
8729 {
8730 	struct io_uring_task *tctx = current->io_uring;
8731 
8732 	if (tctx->last == file)
8733 		tctx->last = NULL;
8734 	file = xa_erase(&tctx->xa, (unsigned long)file);
8735 	if (file)
8736 		fput(file);
8737 }
8738 
8739 /*
8740  * Drop task note for this file if we're the only ones that hold it after
8741  * pending fput()
8742  */
io_uring_attempt_task_drop(struct file * file)8743 static void io_uring_attempt_task_drop(struct file *file)
8744 {
8745 	if (!current->io_uring)
8746 		return;
8747 	/*
8748 	 * fput() is pending, will be 2 if the only other ref is our potential
8749 	 * task file note. If the task is exiting, drop regardless of count.
8750 	 */
8751 	if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
8752 	    atomic_long_read(&file->f_count) == 2)
8753 		io_uring_del_task_file(file);
8754 }
8755 
/*
 * Cancel requests on every io_uring file tracked in current's io_uring
 * context. When @files is non-NULL, the cancelation is limited to requests
 * tied to @files and each file's task note is dropped afterwards; with
 * @files == NULL the notes are kept.
 *
 * Dereferences current->io_uring without a NULL check.
 */
void __io_uring_files_cancel(struct files_struct *files)
{
	struct io_uring_task *tctx = current->io_uring;
	struct file *file;
	unsigned long index;

	/* make sure overflow events are dropped */
	atomic_inc(&tctx->in_idle);

	xa_for_each(&tctx->xa, index, file) {
		struct io_ring_ctx *ctx = file->private_data;

		io_uring_cancel_task_requests(ctx, files);
		if (files)
			io_uring_del_task_file(file);
	}

	atomic_dec(&tctx->in_idle);
}
8775 
tctx_inflight(struct io_uring_task * tctx)8776 static s64 tctx_inflight(struct io_uring_task *tctx)
8777 {
8778 	unsigned long index;
8779 	struct file *file;
8780 	s64 inflight;
8781 
8782 	inflight = percpu_counter_sum(&tctx->inflight);
8783 	if (!tctx->sqpoll)
8784 		return inflight;
8785 
8786 	/*
8787 	 * If we have SQPOLL rings, then we need to iterate and find them, and
8788 	 * add the pending count for those.
8789 	 */
8790 	xa_for_each(&tctx->xa, index, file) {
8791 		struct io_ring_ctx *ctx = file->private_data;
8792 
8793 		if (ctx->flags & IORING_SETUP_SQPOLL) {
8794 			struct io_uring_task *__tctx = ctx->sqo_task->io_uring;
8795 
8796 			inflight += percpu_counter_sum(&__tctx->inflight);
8797 		}
8798 	}
8799 
8800 	return inflight;
8801 }
8802 
/*
 * Find any io_uring fd that this task has registered or done IO on, and cancel
 * requests. Loops until tctx_inflight() reports that nothing is in flight,
 * sleeping on tctx->wait between attempts.
 *
 * Dereferences current->io_uring without a NULL check.
 */
void __io_uring_task_cancel(void)
{
	struct io_uring_task *tctx = current->io_uring;
	DEFINE_WAIT(wait);
	s64 inflight;

	/* make sure overflow events are dropped */
	atomic_inc(&tctx->in_idle);

	do {
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx);
		if (!inflight)
			break;
		__io_uring_files_cancel(NULL);

		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);

		/*
		 * If we've seen completions, retry. This avoids a race where
		 * a completion comes in before we did prepare_to_wait().
		 */
		if (inflight != tctx_inflight(tctx))
			continue;
		schedule();
	} while (1);

	finish_wait(&tctx->wait, &wait);
	atomic_dec(&tctx->in_idle);
}
8837 
/*
 * ->flush() handler for an io_uring file: gives the current task the chance
 * to drop its note for @file. @data is unused. Always returns 0.
 */
static int io_uring_flush(struct file *file, void *data)
{
	io_uring_attempt_task_drop(file);
	return 0;
}
8843 
/*
 * Translate an mmap request on an io_uring file into the kernel address of
 * the region it refers to.
 *
 * @pgoff is the mapping offset in pages; shifted up to a byte offset it must
 * equal IORING_OFF_SQ_RING or IORING_OFF_CQ_RING (both select ctx->rings) or
 * IORING_OFF_SQES (selects ctx->sq_sqes). @sz is the requested length.
 *
 * Returns the kernel pointer, or ERR_PTR(-EINVAL) for an unknown offset or
 * when @sz is larger than the size of the head page backing the region.
 */
static void *io_uring_validate_mmap_request(struct file *file,
					    loff_t pgoff, size_t sz)
{
	struct io_ring_ctx *ctx = file->private_data;
	loff_t offset = pgoff << PAGE_SHIFT;
	void *ptr;

	if (offset == IORING_OFF_SQ_RING || offset == IORING_OFF_CQ_RING)
		ptr = ctx->rings;
	else if (offset == IORING_OFF_SQES)
		ptr = ctx->sq_sqes;
	else
		return ERR_PTR(-EINVAL);

	if (sz > page_size(virt_to_head_page(ptr)))
		return ERR_PTR(-EINVAL);

	return ptr;
}
8870 
8871 #ifdef CONFIG_MMU
8872 
io_uring_mmap(struct file * file,struct vm_area_struct * vma)8873 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8874 {
8875 	size_t sz = vma->vm_end - vma->vm_start;
8876 	unsigned long pfn;
8877 	void *ptr;
8878 
8879 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
8880 	if (IS_ERR(ptr))
8881 		return PTR_ERR(ptr);
8882 
8883 	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
8884 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
8885 }
8886 
8887 #else /* !CONFIG_MMU */
8888 
io_uring_mmap(struct file * file,struct vm_area_struct * vma)8889 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8890 {
8891 	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
8892 }
8893 
/*
 * ->mmap_capabilities() handler (!CONFIG_MMU): reports the fixed set
 * NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE for every file.
 */
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}
8898 
/*
 * ->get_unmapped_area() handler (!CONFIG_MMU): returns the kernel address of
 * the ring region selected by @pgoff and @len, or the error value from
 * io_uring_validate_mmap_request() converted to unsigned long. @addr and
 * @flags are not used.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	if (!IS_ERR(region))
		return (unsigned long) region;

	return PTR_ERR(region);
}
8911 
8912 #endif /* !CONFIG_MMU */
8913 
/*
 * Sleep interruptibly on ctx->sqo_sq_wait until the SQ ring is no longer
 * full or a signal is pending for the current task.
 */
static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
	DEFINE_WAIT(wait);

	do {
		if (!io_sqring_full(ctx))
			break;

		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);

		/* re-check after queueing ourselves, before going to sleep */
		if (!io_sqring_full(ctx))
			break;

		schedule();
	} while (!signal_pending(current));

	finish_wait(&ctx->sqo_sq_wait, &wait);
}
8932 
/*
 * io_uring_enter(2): submit up to @to_submit SQEs on the ring behind @fd
 * and/or wait for completions.
 *
 * @flags may contain IORING_ENTER_GETEVENTS (wait for @min_complete
 * completions, using @sig/@sigsz for the wait), IORING_ENTER_SQ_WAKEUP (wake
 * the SQ poll thread) and IORING_ENTER_SQ_WAIT (wait for SQ ring space);
 * any other bit yields -EINVAL.
 *
 * Returns the submit count if it is non-zero, otherwise the result of the
 * last step: -EBADF for a bad @fd, -EOPNOTSUPP if @fd is not an io_uring
 * file, -ENXIO if no ctx reference could be taken, -EBADFD if the ring is
 * still IORING_SETUP_R_DISABLED, or the result of the completion wait.
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const sigset_t __user *, sig,
		size_t, sigsz)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	int submitted = 0;
	struct fd f;

	io_run_task_work();

	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
			IORING_ENTER_SQ_WAIT))
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
	if (!percpu_ref_tryget(&ctx->refs))
		goto out_fput;

	ret = -EBADFD;
	if (ctx->flags & IORING_SETUP_R_DISABLED)
		goto out;

	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	ret = 0;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (!list_empty_careful(&ctx->cq_overflow_list))
			io_cqring_overflow_flush(ctx, false, NULL, NULL);
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sq_data->wait);
		if (flags & IORING_ENTER_SQ_WAIT)
			io_sqpoll_wait_sq(ctx);
		submitted = to_submit;
	} else if (to_submit) {
		/* record that this task uses the ring, for cancelation */
		ret = io_uring_add_task_file(ctx, f.file);
		if (unlikely(ret))
			goto out;
		mutex_lock(&ctx->uring_lock);
		submitted = io_submit_sqes(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);

		/* on a short submit, skip the wait and report the count */
		if (submitted != to_submit)
			goto out;
	}
	if (flags & IORING_ENTER_GETEVENTS) {
		/* never wait for more completions than the CQ ring can hold */
		min_complete = min(min_complete, ctx->cq_entries);

		/*
		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
		 * space applications don't need to do io completion events
		 * polling again, they can rely on io_sq_thread to do polling
		 * work, which can reduce cpu usage and uring_lock contention.
		 */
		if (ctx->flags & IORING_SETUP_IOPOLL &&
		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
			ret = io_iopoll_check(ctx, min_complete);
		} else {
			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
		}
	}

out:
	percpu_ref_put(&ctx->refs);
out_fput:
	fdput(f);
	/* a non-zero submit count takes precedence over ret */
	return submitted ? submitted : ret;
}
9013 
9014 #ifdef CONFIG_PROC_FS
/*
 * idr_for_each() callback used for fdinfo output: prints personality @id and
 * the credentials of the io_identity @p to the seq_file passed in @data.
 *
 * Emits the id, then the Uid and Gid lines (real, effective, saved and fs
 * ids, translated into the seq_file's user namespace), the supplementary
 * Groups and the effective capability set in hex. Always returns 0 so that
 * the iteration continues.
 */
static int io_uring_show_cred(int id, void *p, void *data)
{
	struct io_identity *iod = p;
	const struct cred *cred = iod->creds;
	struct seq_file *m = data;
	struct user_namespace *uns = seq_user_ns(m);
	struct group_info *gi;
	kernel_cap_t cap;
	unsigned __capi;
	int g;

	seq_printf(m, "%5d\n", id);
	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
	seq_puts(m, "\n\tGroups:\t");
	gi = cred->group_info;
	/* space separated, no separator before the first group */
	for (g = 0; g < gi->ngroups; g++) {
		seq_put_decimal_ull(m, g ? " " : "",
					from_kgid_munged(uns, gi->gid[g]));
	}
	seq_puts(m, "\n\tCapEff:\t");
	cap = cred->cap_effective;
	/* indexed from CAP_LAST_U32 downwards, 8 hex digits per word */
	CAP_FOR_EACH_U32(__capi)
		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
	seq_putc(m, '\n');
	return 0;
}
9048 
/*
 * Write the fdinfo description of @ctx to @m: the SQ thread pid and cpu, the
 * registered files and buffers, the registered personalities, and the
 * requests currently hashed in ctx->cancel_hash ("PollList").
 *
 * ctx->uring_lock is only try-locked. When it cannot be taken, the counts are
 * still printed but the per-entry listings that depend on the lock are left
 * out, and the SQ thread fields are printed as -1.
 */
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
	struct io_sq_data *sq = NULL;
	bool has_lock;
	int i;

	/*
	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
	 * since fdinfo case grabs it in the opposite direction of normal use
	 * cases. If we fail to get the lock, we just don't iterate any
	 * structures that could be going away outside the io_uring mutex.
	 */
	has_lock = mutex_trylock(&ctx->uring_lock);

	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
		sq = ctx->sq_data;

	/* -1 when there is no SQ thread to report or the lock was not taken */
	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
		struct fixed_file_table *table;
		struct file *f;

		/* two-level lookup: table index, then slot within the table */
		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
		f = table->files[i & IORING_FILE_TABLE_MASK];
		if (f)
			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
		else
			seq_printf(m, "%5u: <none>\n", i);
	}
	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *buf = &ctx->user_bufs[i];

		/* user address and length of the registered buffer */
		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
						(unsigned int) buf->len);
	}
	if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
		seq_printf(m, "Personalities:\n");
		idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
	}
	seq_printf(m, "PollList:\n");
	/* the hash is walked under completion_lock, not uring_lock */
	spin_lock_irq(&ctx->completion_lock);
	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
		struct hlist_head *list = &ctx->cancel_hash[i];
		struct io_kiocb *req;

		hlist_for_each_entry(req, list, hash_node)
			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
					req->task->task_works != NULL);
	}
	spin_unlock_irq(&ctx->completion_lock);
	if (has_lock)
		mutex_unlock(&ctx->uring_lock);
}
9105 
io_uring_show_fdinfo(struct seq_file * m,struct file * f)9106 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9107 {
9108 	struct io_ring_ctx *ctx = f->private_data;
9109 
9110 	if (percpu_ref_tryget(&ctx->refs)) {
9111 		__io_uring_show_fdinfo(ctx, m);
9112 		percpu_ref_put(&ctx->refs);
9113 	}
9114 }
9115 #endif
9116 
9117 static const struct file_operations io_uring_fops = {
9118 	.release	= io_uring_release,
9119 	.flush		= io_uring_flush,
9120 	.mmap		= io_uring_mmap,
9121 #ifndef CONFIG_MMU
9122 	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
9123 	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
9124 #endif
9125 	.poll		= io_uring_poll,
9126 	.fasync		= io_uring_fasync,
9127 #ifdef CONFIG_PROC_FS
9128 	.show_fdinfo	= io_uring_show_fdinfo,
9129 #endif
9130 };
9131 
io_allocate_scq_urings(struct io_ring_ctx * ctx,struct io_uring_params * p)9132 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9133 				  struct io_uring_params *p)
9134 {
9135 	struct io_rings *rings;
9136 	size_t size, sq_array_offset;
9137 
9138 	/* make sure these are sane, as we already accounted them */
9139 	ctx->sq_entries = p->sq_entries;
9140 	ctx->cq_entries = p->cq_entries;
9141 
9142 	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9143 	if (size == SIZE_MAX)
9144 		return -EOVERFLOW;
9145 
9146 	rings = io_mem_alloc(size);
9147 	if (!rings)
9148 		return -ENOMEM;
9149 
9150 	ctx->rings = rings;
9151 	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9152 	rings->sq_ring_mask = p->sq_entries - 1;
9153 	rings->cq_ring_mask = p->cq_entries - 1;
9154 	rings->sq_ring_entries = p->sq_entries;
9155 	rings->cq_ring_entries = p->cq_entries;
9156 	ctx->sq_mask = rings->sq_ring_mask;
9157 	ctx->cq_mask = rings->cq_ring_mask;
9158 
9159 	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9160 	if (size == SIZE_MAX) {
9161 		io_mem_free(ctx->rings);
9162 		ctx->rings = NULL;
9163 		return -EOVERFLOW;
9164 	}
9165 
9166 	ctx->sq_sqes = io_mem_alloc(size);
9167 	if (!ctx->sq_sqes) {
9168 		io_mem_free(ctx->rings);
9169 		ctx->rings = NULL;
9170 		return -ENOMEM;
9171 	}
9172 
9173 	return 0;
9174 }
9175 
9176 /*
9177  * Allocate an anonymous fd, this is what constitutes the application
9178  * visible backing of an io_uring instance. The application mmaps this
9179  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9180  * we have to tie this fd to a socket for file garbage collection purposes.
9181  */
io_uring_get_fd(struct io_ring_ctx * ctx)9182 static int io_uring_get_fd(struct io_ring_ctx *ctx)
9183 {
9184 	struct file *file;
9185 	int ret;
9186 	int fd;
9187 
9188 #if defined(CONFIG_UNIX)
9189 	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9190 				&ctx->ring_sock);
9191 	if (ret)
9192 		return ret;
9193 #endif
9194 
9195 	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9196 	if (ret < 0)
9197 		goto err;
9198 	fd = ret;
9199 
9200 	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9201 					O_RDWR | O_CLOEXEC);
9202 	if (IS_ERR(file)) {
9203 		put_unused_fd(fd);
9204 		ret = PTR_ERR(file);
9205 		goto err;
9206 	}
9207 
9208 #if defined(CONFIG_UNIX)
9209 	ctx->ring_sock->file = file;
9210 #endif
9211 	ret = io_uring_add_task_file(ctx, file);
9212 	if (ret) {
9213 		fput(file);
9214 		put_unused_fd(fd);
9215 		goto err;
9216 	}
9217 	fd_install(fd, file);
9218 	return fd;
9219 err:
9220 #if defined(CONFIG_UNIX)
9221 	sock_release(ctx->ring_sock);
9222 	ctx->ring_sock = NULL;
9223 #endif
9224 	return ret;
9225 }
9226 
io_uring_create(unsigned entries,struct io_uring_params * p,struct io_uring_params __user * params)9227 static int io_uring_create(unsigned entries, struct io_uring_params *p,
9228 			   struct io_uring_params __user *params)
9229 {
9230 	struct user_struct *user = NULL;
9231 	struct io_ring_ctx *ctx;
9232 	bool limit_mem;
9233 	int ret;
9234 
9235 	if (!entries)
9236 		return -EINVAL;
9237 	if (entries > IORING_MAX_ENTRIES) {
9238 		if (!(p->flags & IORING_SETUP_CLAMP))
9239 			return -EINVAL;
9240 		entries = IORING_MAX_ENTRIES;
9241 	}
9242 
9243 	/*
9244 	 * Use twice as many entries for the CQ ring. It's possible for the
9245 	 * application to drive a higher depth than the size of the SQ ring,
9246 	 * since the sqes are only used at submission time. This allows for
9247 	 * some flexibility in overcommitting a bit. If the application has
9248 	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9249 	 * of CQ ring entries manually.
9250 	 */
9251 	p->sq_entries = roundup_pow_of_two(entries);
9252 	if (p->flags & IORING_SETUP_CQSIZE) {
9253 		/*
9254 		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
9255 		 * to a power-of-two, if it isn't already. We do NOT impose
9256 		 * any cq vs sq ring sizing.
9257 		 */
9258 		if (!p->cq_entries)
9259 			return -EINVAL;
9260 		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9261 			if (!(p->flags & IORING_SETUP_CLAMP))
9262 				return -EINVAL;
9263 			p->cq_entries = IORING_MAX_CQ_ENTRIES;
9264 		}
9265 		p->cq_entries = roundup_pow_of_two(p->cq_entries);
9266 		if (p->cq_entries < p->sq_entries)
9267 			return -EINVAL;
9268 	} else {
9269 		p->cq_entries = 2 * p->sq_entries;
9270 	}
9271 
9272 	user = get_uid(current_user());
9273 	limit_mem = !capable(CAP_IPC_LOCK);
9274 
9275 	if (limit_mem) {
9276 		ret = __io_account_mem(user,
9277 				ring_pages(p->sq_entries, p->cq_entries));
9278 		if (ret) {
9279 			free_uid(user);
9280 			return ret;
9281 		}
9282 	}
9283 
9284 	ctx = io_ring_ctx_alloc(p);
9285 	if (!ctx) {
9286 		if (limit_mem)
9287 			__io_unaccount_mem(user, ring_pages(p->sq_entries,
9288 								p->cq_entries));
9289 		free_uid(user);
9290 		return -ENOMEM;
9291 	}
9292 	ctx->compat = in_compat_syscall();
9293 	ctx->user = user;
9294 	ctx->creds = get_current_cred();
9295 #ifdef CONFIG_AUDIT
9296 	ctx->loginuid = current->loginuid;
9297 	ctx->sessionid = current->sessionid;
9298 #endif
9299 	ctx->sqo_task = get_task_struct(current);
9300 
9301 	/*
9302 	 * This is just grabbed for accounting purposes. When a process exits,
9303 	 * the mm is exited and dropped before the files, hence we need to hang
9304 	 * on to this mm purely for the purposes of being able to unaccount
9305 	 * memory (locked/pinned vm). It's not used for anything else.
9306 	 */
9307 	mmgrab(current->mm);
9308 	ctx->mm_account = current->mm;
9309 
9310 #ifdef CONFIG_BLK_CGROUP
9311 	/*
9312 	 * The sq thread will belong to the original cgroup it was inited in.
9313 	 * If the cgroup goes offline (e.g. disabling the io controller), then
9314 	 * issued bios will be associated with the closest cgroup later in the
9315 	 * block layer.
9316 	 */
9317 	rcu_read_lock();
9318 	ctx->sqo_blkcg_css = blkcg_css();
9319 	ret = css_tryget_online(ctx->sqo_blkcg_css);
9320 	rcu_read_unlock();
9321 	if (!ret) {
9322 		/* don't init against a dying cgroup, have the user try again */
9323 		ctx->sqo_blkcg_css = NULL;
9324 		ret = -ENODEV;
9325 		goto err;
9326 	}
9327 #endif
9328 
9329 	/*
9330 	 * Account memory _before_ installing the file descriptor. Once
9331 	 * the descriptor is installed, it can get closed at any time. Also
9332 	 * do this before hitting the general error path, as ring freeing
9333 	 * will un-account as well.
9334 	 */
9335 	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
9336 		       ACCT_LOCKED);
9337 	ctx->limit_mem = limit_mem;
9338 
9339 	ret = io_allocate_scq_urings(ctx, p);
9340 	if (ret)
9341 		goto err;
9342 
9343 	ret = io_sq_offload_create(ctx, p);
9344 	if (ret)
9345 		goto err;
9346 
9347 	if (!(p->flags & IORING_SETUP_R_DISABLED))
9348 		io_sq_offload_start(ctx);
9349 
9350 	memset(&p->sq_off, 0, sizeof(p->sq_off));
9351 	p->sq_off.head = offsetof(struct io_rings, sq.head);
9352 	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9353 	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9354 	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9355 	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9356 	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9357 	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9358 
9359 	memset(&p->cq_off, 0, sizeof(p->cq_off));
9360 	p->cq_off.head = offsetof(struct io_rings, cq.head);
9361 	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9362 	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9363 	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9364 	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9365 	p->cq_off.cqes = offsetof(struct io_rings, cqes);
9366 	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9367 
9368 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9369 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9370 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9371 			IORING_FEAT_POLL_32BITS;
9372 
9373 	if (copy_to_user(params, p, sizeof(*p))) {
9374 		ret = -EFAULT;
9375 		goto err;
9376 	}
9377 
9378 	/*
9379 	 * Install ring fd as the very last thing, so we don't risk someone
9380 	 * having closed it before we finish setup
9381 	 */
9382 	ret = io_uring_get_fd(ctx);
9383 	if (ret < 0)
9384 		goto err;
9385 
9386 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9387 	return ret;
9388 err:
9389 	io_ring_ctx_wait_and_kill(ctx);
9390 	return ret;
9391 }
9392 
9393 /*
9394  * Sets up an aio uring context, and returns the fd. Applications asks for a
9395  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9396  * params structure passed in.
9397  */
io_uring_setup(u32 entries,struct io_uring_params __user * params)9398 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9399 {
9400 	struct io_uring_params p;
9401 	int i;
9402 
9403 	if (copy_from_user(&p, params, sizeof(p)))
9404 		return -EFAULT;
9405 	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9406 		if (p.resv[i])
9407 			return -EINVAL;
9408 	}
9409 
9410 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9411 			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9412 			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9413 			IORING_SETUP_R_DISABLED))
9414 		return -EINVAL;
9415 
9416 	return  io_uring_create(entries, &p, params);
9417 }
9418 
/*
 * io_uring_setup syscall entry point: hands both arguments unchanged to
 * io_uring_setup() and returns its result.
 */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
9424 
/*
 * IORING_REGISTER_PROBE: report which opcodes are supported.
 *
 * @arg points to a struct io_uring_probe with room for @nr_args ops. The
 * user buffer must be all zeroes on entry (-EINVAL otherwise). At most
 * IORING_OP_LAST entries are filled in; the full, originally sized buffer is
 * copied back.
 *
 * Returns 0, -EOVERFLOW, -ENOMEM, -EFAULT or -EINVAL.
 */
static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
	struct io_uring_probe *probe;
	size_t bytes;
	int op, ret;

	bytes = struct_size(probe, ops, nr_args);
	if (bytes == SIZE_MAX)
		return -EOVERFLOW;

	probe = kzalloc(bytes, GFP_KERNEL);
	if (!probe)
		return -ENOMEM;

	if (copy_from_user(probe, arg, bytes)) {
		ret = -EFAULT;
		goto free_probe;
	}
	if (memchr_inv(probe, 0, bytes)) {
		ret = -EINVAL;
		goto free_probe;
	}

	probe->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (op = 0; op < nr_args; op++) {
		probe->ops[op].op = op;
		if (!io_op_defs[op].not_supported)
			probe->ops[op].flags = IO_URING_OP_SUPPORTED;
	}
	probe->ops_len = op;

	ret = copy_to_user(arg, probe, bytes) ? -EFAULT : 0;

free_probe:
	kfree(probe);
	return ret;
}
9463 
io_register_personality(struct io_ring_ctx * ctx)9464 static int io_register_personality(struct io_ring_ctx *ctx)
9465 {
9466 	struct io_identity *id;
9467 	int ret;
9468 
9469 	id = kmalloc(sizeof(*id), GFP_KERNEL);
9470 	if (unlikely(!id))
9471 		return -ENOMEM;
9472 
9473 	io_init_identity(id);
9474 	id->creds = get_current_cred();
9475 
9476 	ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
9477 	if (ret < 0) {
9478 		put_cred(id->creds);
9479 		kfree(id);
9480 	}
9481 	return ret;
9482 }
9483 
io_unregister_personality(struct io_ring_ctx * ctx,unsigned id)9484 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
9485 {
9486 	struct io_identity *iod;
9487 
9488 	iod = idr_remove(&ctx->personality_idr, id);
9489 	if (iod) {
9490 		put_cred(iod->creds);
9491 		if (refcount_dec_and_test(&iod->count))
9492 			kfree(iod);
9493 		return 0;
9494 	}
9495 
9496 	return -EINVAL;
9497 }
9498 
/*
 * IORING_REGISTER_RESTRICTIONS: install the set of restrictions described by
 * the @nr_args entries of struct io_uring_restriction at @arg.
 *
 * Only possible while the ring is still disabled (-EBADFD otherwise) and only
 * once per ring (-EBUSY). Any invalid entry fails the whole call with -EINVAL
 * and wipes ctx->restrictions again.
 */
static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
				    unsigned int nr_args)
{
	struct io_uring_restriction *entries;
	size_t bytes;
	int idx;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	bytes = array_size(nr_args, sizeof(*entries));
	if (bytes == SIZE_MAX)
		return -EOVERFLOW;

	entries = memdup_user(arg, bytes);
	if (IS_ERR(entries))
		return PTR_ERR(entries);

	for (idx = 0; idx < nr_args; idx++) {
		const struct io_uring_restriction *r = &entries[idx];

		switch (r->opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (r->register_op >= IORING_REGISTER_LAST)
				goto invalid;
			__set_bit(r->register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (r->sqe_op >= IORING_OP_LAST)
				goto invalid;
			__set_bit(r->sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = r->sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = r->sqe_flags;
			break;
		default:
			goto invalid;
		}
	}

	ctx->restrictions.registered = true;
	kfree(entries);
	return 0;

invalid:
	/* Reset all restrictions if an error happened */
	memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	kfree(entries);
	return -EINVAL;
}
9568 
io_register_enable_rings(struct io_ring_ctx * ctx)9569 static int io_register_enable_rings(struct io_ring_ctx *ctx)
9570 {
9571 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9572 		return -EBADFD;
9573 
9574 	if (ctx->restrictions.registered)
9575 		ctx->restricted = 1;
9576 
9577 	ctx->flags &= ~IORING_SETUP_R_DISABLED;
9578 
9579 	io_sq_offload_start(ctx);
9580 
9581 	return 0;
9582 }
9583 
io_register_op_must_quiesce(int op)9584 static bool io_register_op_must_quiesce(int op)
9585 {
9586 	switch (op) {
9587 	case IORING_UNREGISTER_FILES:
9588 	case IORING_REGISTER_FILES_UPDATE:
9589 	case IORING_REGISTER_PROBE:
9590 	case IORING_REGISTER_PERSONALITY:
9591 	case IORING_UNREGISTER_PERSONALITY:
9592 		return false;
9593 	default:
9594 		return true;
9595 	}
9596 }
9597 
/*
 * Execute one io_uring_register() operation on @ctx.
 *
 * @ctx:     ring to operate on
 * @opcode:  IORING_REGISTER_* / IORING_UNREGISTER_* operation
 * @arg:     user pointer argument, meaning depends on @opcode
 * @nr_args: count/id argument, meaning depends on @opcode
 *
 * Entered and left with ctx->uring_lock held. For opcodes that
 * io_register_op_must_quiesce() reports as quiescing, the ctx percpu ref is
 * killed and the lock is dropped while waiting for ctx->ref_comp; the ref is
 * brought back before returning.
 *
 * Returns the result of the selected handler, -ENXIO if the ctx ref is
 * already dying, -EINVAL for an unknown opcode or bad arguments, -EACCES if
 * a restricted ring does not allow @opcode, or the negative value the
 * quiesce wait was aborted with.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

	if (io_register_op_must_quiesce(opcode)) {
		percpu_ref_kill(&ctx->refs);

		/*
		 * Drop uring mutex before waiting for references to exit. If
		 * another thread is currently inside io_uring_enter() it might
		 * need to grab the uring_lock to make progress. If we hold it
		 * here across the drain wait, then we can deadlock. It's safe
		 * to drop the mutex here, since no new references will come in
		 * after we've killed the percpu ref.
		 */
		mutex_unlock(&ctx->uring_lock);
		/*
		 * Wait until the completion fires (ret == 0). When the wait is
		 * interrupted, run io_run_task_work_sig() and retry unless it
		 * reports an error.
		 */
		do {
			ret = wait_for_completion_interruptible(&ctx->ref_comp);
			if (!ret)
				break;
			ret = io_run_task_work_sig();
			if (ret < 0)
				break;
		} while (1);

		mutex_lock(&ctx->uring_lock);

		if (ret) {
			/* Wait was aborted: undo the kill and skip the operation */
			percpu_ref_resurrect(&ctx->refs);
			goto out_quiesce;
		}
	}

	/* A restricted ring only accepts opcodes set in restrictions.register_op */
	if (ctx->restricted) {
		if (opcode >= IORING_REGISTER_LAST) {
			ret = -EINVAL;
			goto out;
		}

		if (!test_bit(opcode, ctx->restrictions.register_op)) {
			ret = -EACCES;
			goto out;
		}
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = io_sqe_buffer_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffer_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = io_sqe_files_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_sqe_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg);
		if (ret)
			break;
		/* Both opcodes share the registration, only the async flag differs */
		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
			ctx->eventfd_async = 1;
		else
			ctx->eventfd_async = 0;
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		/* nr_args carries the personality id here */
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	if (io_register_op_must_quiesce(opcode)) {
		/* bring the ctx back to life */
		percpu_ref_reinit(&ctx->refs);
		/*
		 * Also jumped to directly after an aborted wait: the ref was
		 * resurrected there already, so only the completion is
		 * re-initialised.
		 */
out_quiesce:
		reinit_completion(&ctx->ref_comp);
	}
	return ret;
}
9736 
/*
 * io_uring_register syscall entry point.
 *
 * Looks up @fd, checks that it refers to an io_uring file and runs the
 * requested operation under the ring mutex. Returns -EBADF for an invalid
 * fd, -EOPNOTSUPP if the file is not an io_uring instance, and the result of
 * __io_uring_register() otherwise.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	struct fd f = fdget(fd);
	long ret;

	if (!f.file)
		return -EBADF;

	if (f.file->f_op == &io_uring_fops) {
		ctx = f.file->private_data;

		mutex_lock(&ctx->uring_lock);
		ret = __io_uring_register(ctx, opcode, arg, nr_args);
		mutex_unlock(&ctx->uring_lock);

		trace_io_uring_register(ctx, opcode, ctx->nr_user_files,
					ctx->nr_user_bufs,
					ctx->cq_ev_fd != NULL, ret);
	} else {
		ret = -EOPNOTSUPP;
	}

	fdput(f);
	return ret;
}
9763 
io_uring_init(void)9764 static int __init io_uring_init(void)
9765 {
9766 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
9767 	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
9768 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
9769 } while (0)
9770 
9771 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
9772 	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
9773 	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
9774 	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
9775 	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
9776 	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
9777 	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
9778 	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
9779 	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
9780 	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
9781 	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
9782 	BUILD_BUG_SQE_ELEM(24, __u32,  len);
9783 	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
9784 	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
9785 	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
9786 	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
9787 	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
9788 	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
9789 	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
9790 	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
9791 	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
9792 	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
9793 	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
9794 	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
9795 	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
9796 	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
9797 	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
9798 	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
9799 	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
9800 	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
9801 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
9802 
9803 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
9804 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
9805 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
9806 	return 0;
9807 };
9808 __initcall(io_uring_init);
9809