1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/fs/read_write.c
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  */
7 
8 #include <linux/slab.h>
9 #include <linux/stat.h>
10 #include <linux/sched/xacct.h>
11 #include <linux/fcntl.h>
12 #include <linux/file.h>
13 #include <linux/uio.h>
14 #include <linux/fsnotify.h>
15 #include <linux/security.h>
16 #include <linux/export.h>
17 #include <linux/syscalls.h>
18 #include <linux/pagemap.h>
19 #include <linux/splice.h>
20 #include <linux/compat.h>
21 #include <linux/mount.h>
22 #include <linux/fs.h>
23 #include "internal.h"
24 
25 #include <linux/uaccess.h>
26 #include <asm/unistd.h>
27 
28 const struct file_operations generic_ro_fops = {
29 	.llseek		= generic_file_llseek,
30 	.read_iter	= generic_file_read_iter,
31 	.mmap		= generic_file_readonly_mmap,
32 	.splice_read	= generic_file_splice_read,
33 };
34 
35 EXPORT_SYMBOL(generic_ro_fops);
36 
unsigned_offsets(struct file * file)37 static inline bool unsigned_offsets(struct file *file)
38 {
39 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
40 }
41 
42 /**
43  * vfs_setpos - update the file offset for lseek
44  * @file:	file structure in question
45  * @offset:	file offset to seek to
46  * @maxsize:	maximum file size
47  *
48  * This is a low-level filesystem helper for updating the file offset to
49  * the value specified by @offset if the given offset is valid and it is
50  * not equal to the current file offset.
51  *
52  * Return the specified offset on success and -EINVAL on invalid offset.
53  */
vfs_setpos(struct file * file,loff_t offset,loff_t maxsize)54 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
55 {
56 	if (offset < 0 && !unsigned_offsets(file))
57 		return -EINVAL;
58 	if (offset > maxsize)
59 		return -EINVAL;
60 
61 	if (offset != file->f_pos) {
62 		file->f_pos = offset;
63 		file->f_version = 0;
64 	}
65 	return offset;
66 }
67 EXPORT_SYMBOL(vfs_setpos);
68 
69 /**
70  * generic_file_llseek_size - generic llseek implementation for regular files
71  * @file:	file structure to seek on
72  * @offset:	file offset to seek to
73  * @whence:	type of seek
74  * @size:	max size of this file in file system
75  * @eof:	offset used for SEEK_END position
76  *
77  * This is a variant of generic_file_llseek that allows passing in a custom
78  * maximum file size and a custom EOF position, for e.g. hashed directories
79  *
80  * Synchronization:
81  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
82  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
83  * read/writes behave like SEEK_SET against seeks.
84  */
85 loff_t
generic_file_llseek_size(struct file * file,loff_t offset,int whence,loff_t maxsize,loff_t eof)86 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
87 		loff_t maxsize, loff_t eof)
88 {
89 	switch (whence) {
90 	case SEEK_END:
91 		offset += eof;
92 		break;
93 	case SEEK_CUR:
94 		/*
95 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
96 		 * position-querying operation.  Avoid rewriting the "same"
97 		 * f_pos value back to the file because a concurrent read(),
98 		 * write() or lseek() might have altered it
99 		 */
100 		if (offset == 0)
101 			return file->f_pos;
102 		/*
103 		 * f_lock protects against read/modify/write race with other
104 		 * SEEK_CURs. Note that parallel writes and reads behave
105 		 * like SEEK_SET.
106 		 */
107 		spin_lock(&file->f_lock);
108 		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
109 		spin_unlock(&file->f_lock);
110 		return offset;
111 	case SEEK_DATA:
112 		/*
113 		 * In the generic case the entire file is data, so as long as
114 		 * offset isn't at the end of the file then the offset is data.
115 		 */
116 		if ((unsigned long long)offset >= eof)
117 			return -ENXIO;
118 		break;
119 	case SEEK_HOLE:
120 		/*
121 		 * There is a virtual hole at the end of the file, so as long as
122 		 * offset isn't i_size or larger, return i_size.
123 		 */
124 		if ((unsigned long long)offset >= eof)
125 			return -ENXIO;
126 		offset = eof;
127 		break;
128 	}
129 
130 	return vfs_setpos(file, offset, maxsize);
131 }
132 EXPORT_SYMBOL(generic_file_llseek_size);
133 
134 /**
135  * generic_file_llseek - generic llseek implementation for regular files
136  * @file:	file structure to seek on
137  * @offset:	file offset to seek to
138  * @whence:	type of seek
139  *
140  * This is a generic implemenation of ->llseek useable for all normal local
141  * filesystems.  It just updates the file offset to the value specified by
142  * @offset and @whence.
143  */
generic_file_llseek(struct file * file,loff_t offset,int whence)144 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145 {
146 	struct inode *inode = file->f_mapping->host;
147 
148 	return generic_file_llseek_size(file, offset, whence,
149 					inode->i_sb->s_maxbytes,
150 					i_size_read(inode));
151 }
152 EXPORT_SYMBOL(generic_file_llseek);
153 
154 /**
155  * fixed_size_llseek - llseek implementation for fixed-sized devices
156  * @file:	file structure to seek on
157  * @offset:	file offset to seek to
158  * @whence:	type of seek
159  * @size:	size of the file
160  *
161  */
fixed_size_llseek(struct file * file,loff_t offset,int whence,loff_t size)162 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163 {
164 	switch (whence) {
165 	case SEEK_SET: case SEEK_CUR: case SEEK_END:
166 		return generic_file_llseek_size(file, offset, whence,
167 						size, size);
168 	default:
169 		return -EINVAL;
170 	}
171 }
172 EXPORT_SYMBOL(fixed_size_llseek);
173 
174 /**
175  * no_seek_end_llseek - llseek implementation for fixed-sized devices
176  * @file:	file structure to seek on
177  * @offset:	file offset to seek to
178  * @whence:	type of seek
179  *
180  */
no_seek_end_llseek(struct file * file,loff_t offset,int whence)181 loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182 {
183 	switch (whence) {
184 	case SEEK_SET: case SEEK_CUR:
185 		return generic_file_llseek_size(file, offset, whence,
186 						OFFSET_MAX, 0);
187 	default:
188 		return -EINVAL;
189 	}
190 }
191 EXPORT_SYMBOL(no_seek_end_llseek);
192 
193 /**
194  * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195  * @file:	file structure to seek on
196  * @offset:	file offset to seek to
197  * @whence:	type of seek
198  * @size:	maximal offset allowed
199  *
200  */
no_seek_end_llseek_size(struct file * file,loff_t offset,int whence,loff_t size)201 loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202 {
203 	switch (whence) {
204 	case SEEK_SET: case SEEK_CUR:
205 		return generic_file_llseek_size(file, offset, whence,
206 						size, 0);
207 	default:
208 		return -EINVAL;
209 	}
210 }
211 EXPORT_SYMBOL(no_seek_end_llseek_size);
212 
213 /**
214  * noop_llseek - No Operation Performed llseek implementation
215  * @file:	file structure to seek on
216  * @offset:	file offset to seek to
217  * @whence:	type of seek
218  *
219  * This is an implementation of ->llseek useable for the rare special case when
220  * userspace expects the seek to succeed but the (device) file is actually not
221  * able to perform the seek. In this case you use noop_llseek() instead of
222  * falling back to the default implementation of ->llseek.
223  */
noop_llseek(struct file * file,loff_t offset,int whence)224 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
225 {
226 	return file->f_pos;
227 }
228 EXPORT_SYMBOL(noop_llseek);
229 
/*
 * llseek implementation for files that cannot seek: ignores all arguments
 * and always fails with -ESPIPE.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	const loff_t err = -ESPIPE;

	return err;
}
EXPORT_SYMBOL(no_llseek);
235 
/*
 * llseek implementation that serializes on the inode lock.
 *
 * The new position is computed from @offset and @whence while holding
 * inode_lock().  SEEK_DATA and SEEK_HOLE treat the whole file as data with
 * a virtual hole at i_size.  A negative result is rejected with -EINVAL
 * unless the file uses unsigned offsets; -ENXIO is returned for
 * SEEK_DATA/SEEK_HOLE at or beyond i_size.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t result;

	inode_lock(inode);

	if (whence == SEEK_END) {
		offset += i_size_read(inode);
	} else if (whence == SEEK_CUR) {
		/* offset 0 is a pure position query: do not touch f_pos */
		if (!offset) {
			result = file->f_pos;
			goto unlock;
		}
		offset += file->f_pos;
	} else if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		/*
		 * In the generic case the entire file is data, followed by
		 * a virtual hole at the end of the file.
		 */
		if (offset >= inode->i_size) {
			result = -ENXIO;
			goto unlock;
		}
		if (whence == SEEK_HOLE)
			offset = inode->i_size;
	}

	if (offset < 0 && !unsigned_offsets(file)) {
		result = -EINVAL;
		goto unlock;
	}

	if (file->f_pos != offset) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	result = offset;
unlock:
	inode_unlock(inode);
	return result;
}
EXPORT_SYMBOL(default_llseek);
290 
/*
 * Dispatch a seek to the file's ->llseek method.  Falls back to
 * no_llseek() (-ESPIPE) when the file lacks FMODE_LSEEK or does not
 * provide an ->llseek method.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t (*seek)(struct file *, loff_t, int) = no_llseek;

	if ((file->f_mode & FMODE_LSEEK) && file->f_op->llseek)
		seek = file->f_op->llseek;

	return seek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);
303 
/*
 * Kernel-side body of lseek(2).
 *
 * Returns the new offset, -EBADF for a bad descriptor, -EINVAL when
 * @whence exceeds SEEK_MAX, or -EOVERFLOW when the resulting loff_t does
 * not survive the round trip through off_t.
 */
off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	struct fd f = fdget_pos(fd);
	off_t ret = -EINVAL;
	loff_t pos;

	if (!f.file)
		return -EBADF;

	if (whence <= SEEK_MAX) {
		pos = vfs_llseek(f.file, offset, whence);
		ret = pos;
		/* LFS: should only happen on 32 bit platforms */
		if ((loff_t)ret != pos)
			ret = -EOVERFLOW;
	}

	fdput_pos(f);
	return ret;
}
321 
/* lseek(2) entry point: all work is done by ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	off_t pos = ksys_lseek(fd, offset, whence);

	return pos;
}
326 
327 #ifdef CONFIG_COMPAT
/* Compat lseek(2) entry point: takes a compat_off_t and defers to ksys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	off_t pos = ksys_lseek(fd, offset, whence);

	return pos;
}
332 #endif
333 
334 #ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek(2): seek using a 64-bit offset passed as two halves and store the
 * resulting position in *@result.
 *
 * Returns 0 on success, -EBADF for a bad descriptor, -EINVAL when @whence
 * exceeds SEEK_MAX, the (negative) error from vfs_llseek(), or -EFAULT when
 * the result cannot be copied to user space.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	struct fd f = fdget_pos(fd);
	loff_t pos;
	int err;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX) {
		err = -EINVAL;
		goto put;
	}

	pos = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
			whence);
	if (pos < 0) {
		err = (int)pos;
		goto put;
	}

	err = copy_to_user(result, &pos, sizeof(pos)) ? -EFAULT : 0;
put:
	fdput_pos(f);
	return err;
}
363 #endif
364 
/*
 * Validate a read or write of @count bytes at *@ppos on @file.
 *
 * Rejects counts that are negative when viewed as ssize_t, and positions or
 * ranges that go negative unless the file uses unsigned offsets.  If the
 * inode has a lock context and mandatory locking applies, the byte range is
 * checked with locks_mandatory_area().  Finally the security hook is asked
 * for MAY_READ or MAY_WRITE.
 *
 * Returns the result of security_file_permission() on success paths, or a
 * negative errno (-EINVAL, -EOVERFLOW, or a locking error).
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode = file_inode(file);
	bool is_read = (read_write == READ);
	loff_t pos;
	int err;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	pos = *ppos;
	if (unlikely(pos < 0)) {
		if (!unsigned_offsets(file))
			return -EINVAL;
		if (count >= -pos) /* both values are in 0..LLONG_MAX */
			return -EOVERFLOW;
	} else if (unlikely((loff_t) (pos + count) < 0)) {
		if (!unsigned_offsets(file))
			return -EINVAL;
	}

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		err = locks_mandatory_area(inode, file, pos, pos + count - 1,
				is_read ? F_RDLCK : F_WRLCK);
		if (err < 0)
			return err;
	}

	return security_file_permission(file, is_read ? MAY_READ : MAY_WRITE);
}
394 
new_sync_read(struct file * filp,char __user * buf,size_t len,loff_t * ppos)395 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
396 {
397 	struct iovec iov = { .iov_base = buf, .iov_len = len };
398 	struct kiocb kiocb;
399 	struct iov_iter iter;
400 	ssize_t ret;
401 
402 	init_sync_kiocb(&kiocb, filp);
403 	kiocb.ki_pos = *ppos;
404 	iov_iter_init(&iter, READ, &iov, 1, len);
405 
406 	ret = call_read_iter(filp, &kiocb, &iter);
407 	BUG_ON(ret == -EIOCBQUEUED);
408 	*ppos = kiocb.ki_pos;
409 	return ret;
410 }
411 
__vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)412 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
413 		   loff_t *pos)
414 {
415 	if (file->f_op->read)
416 		return file->f_op->read(file, buf, count, pos);
417 	else if (file->f_op->read_iter)
418 		return new_sync_read(file, buf, count, pos);
419 	else
420 		return -EINVAL;
421 }
422 
kernel_read(struct file * file,void * buf,size_t count,loff_t * pos)423 ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
424 {
425 	mm_segment_t old_fs;
426 	ssize_t result;
427 
428 	old_fs = get_fs();
429 	set_fs(get_ds());
430 	/* The cast to a user pointer is valid due to the set_fs() */
431 	result = vfs_read(file, (void __user *)buf, count, pos);
432 	set_fs(old_fs);
433 	return result;
434 }
435 EXPORT_SYMBOL(kernel_read);
436 
vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)437 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
438 {
439 	ssize_t ret;
440 
441 	if (!(file->f_mode & FMODE_READ))
442 		return -EBADF;
443 	if (!(file->f_mode & FMODE_CAN_READ))
444 		return -EINVAL;
445 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
446 		return -EFAULT;
447 
448 	ret = rw_verify_area(READ, file, pos, count);
449 	if (!ret) {
450 		if (count > MAX_RW_COUNT)
451 			count =  MAX_RW_COUNT;
452 		ret = __vfs_read(file, buf, count, pos);
453 		if (ret > 0) {
454 			fsnotify_access(file);
455 			add_rchar(current, ret);
456 		}
457 		inc_syscr(current);
458 	}
459 
460 	return ret;
461 }
462 
new_sync_write(struct file * filp,const char __user * buf,size_t len,loff_t * ppos)463 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
464 {
465 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
466 	struct kiocb kiocb;
467 	struct iov_iter iter;
468 	ssize_t ret;
469 
470 	init_sync_kiocb(&kiocb, filp);
471 	kiocb.ki_pos = *ppos;
472 	iov_iter_init(&iter, WRITE, &iov, 1, len);
473 
474 	ret = call_write_iter(filp, &kiocb, &iter);
475 	BUG_ON(ret == -EIOCBQUEUED);
476 	if (ret > 0)
477 		*ppos = kiocb.ki_pos;
478 	return ret;
479 }
480 
__vfs_write(struct file * file,const char __user * p,size_t count,loff_t * pos)481 ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
482 		    loff_t *pos)
483 {
484 	if (file->f_op->write)
485 		return file->f_op->write(file, p, count, pos);
486 	else if (file->f_op->write_iter)
487 		return new_sync_write(file, p, count, pos);
488 	else
489 		return -EINVAL;
490 }
491 
__kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)492 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
493 {
494 	mm_segment_t old_fs;
495 	const char __user *p;
496 	ssize_t ret;
497 
498 	if (!(file->f_mode & FMODE_CAN_WRITE))
499 		return -EINVAL;
500 
501 	old_fs = get_fs();
502 	set_fs(get_ds());
503 	p = (__force const char __user *)buf;
504 	if (count > MAX_RW_COUNT)
505 		count =  MAX_RW_COUNT;
506 	ret = __vfs_write(file, p, count, pos);
507 	set_fs(old_fs);
508 	if (ret > 0) {
509 		fsnotify_modify(file);
510 		add_wchar(current, ret);
511 	}
512 	inc_syscw(current);
513 	return ret;
514 }
515 EXPORT_SYMBOL(__kernel_write);
516 
kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)517 ssize_t kernel_write(struct file *file, const void *buf, size_t count,
518 			    loff_t *pos)
519 {
520 	mm_segment_t old_fs;
521 	ssize_t res;
522 
523 	old_fs = get_fs();
524 	set_fs(get_ds());
525 	/* The cast to a user pointer is valid due to the set_fs() */
526 	res = vfs_write(file, (__force const char __user *)buf, count, pos);
527 	set_fs(old_fs);
528 
529 	return res;
530 }
531 EXPORT_SYMBOL(kernel_write);
532 
vfs_write(struct file * file,const char __user * buf,size_t count,loff_t * pos)533 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
534 {
535 	ssize_t ret;
536 
537 	if (!(file->f_mode & FMODE_WRITE))
538 		return -EBADF;
539 	if (!(file->f_mode & FMODE_CAN_WRITE))
540 		return -EINVAL;
541 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
542 		return -EFAULT;
543 
544 	ret = rw_verify_area(WRITE, file, pos, count);
545 	if (!ret) {
546 		if (count > MAX_RW_COUNT)
547 			count =  MAX_RW_COUNT;
548 		file_start_write(file);
549 		ret = __vfs_write(file, buf, count, pos);
550 		if (ret > 0) {
551 			fsnotify_modify(file);
552 			add_wchar(current, ret);
553 		}
554 		inc_syscw(current);
555 		file_end_write(file);
556 	}
557 
558 	return ret;
559 }
560 
file_pos_read(struct file * file)561 static inline loff_t file_pos_read(struct file *file)
562 {
563 	return file->f_pos;
564 }
565 
/* Store @pos as the current position (f_pos) of @file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
	return;
}
570 
ksys_read(unsigned int fd,char __user * buf,size_t count)571 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
572 {
573 	struct fd f = fdget_pos(fd);
574 	ssize_t ret = -EBADF;
575 
576 	if (f.file) {
577 		loff_t pos = file_pos_read(f.file);
578 		ret = vfs_read(f.file, buf, count, &pos);
579 		if (ret >= 0)
580 			file_pos_write(f.file, pos);
581 		fdput_pos(f);
582 	}
583 	return ret;
584 }
585 
/* read(2) entry point: all work is done by ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	ssize_t nread = ksys_read(fd, buf, count);

	return nread;
}
590 
ksys_write(unsigned int fd,const char __user * buf,size_t count)591 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
592 {
593 	struct fd f = fdget_pos(fd);
594 	ssize_t ret = -EBADF;
595 
596 	if (f.file) {
597 		loff_t pos = file_pos_read(f.file);
598 		ret = vfs_write(f.file, buf, count, &pos);
599 		if (ret >= 0)
600 			file_pos_write(f.file, pos);
601 		fdput_pos(f);
602 	}
603 
604 	return ret;
605 }
606 
/* write(2) entry point: all work is done by ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	ssize_t written = ksys_write(fd, buf, count);

	return written;
}
612 
ksys_pread64(unsigned int fd,char __user * buf,size_t count,loff_t pos)613 ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
614 		     loff_t pos)
615 {
616 	struct fd f;
617 	ssize_t ret = -EBADF;
618 
619 	if (pos < 0)
620 		return -EINVAL;
621 
622 	f = fdget(fd);
623 	if (f.file) {
624 		ret = -ESPIPE;
625 		if (f.file->f_mode & FMODE_PREAD)
626 			ret = vfs_read(f.file, buf, count, &pos);
627 		fdput(f);
628 	}
629 
630 	return ret;
631 }
632 
/* pread64(2) entry point: all work is done by ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	ssize_t nread = ksys_pread64(fd, buf, count, pos);

	return nread;
}
638 
ksys_pwrite64(unsigned int fd,const char __user * buf,size_t count,loff_t pos)639 ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
640 		      size_t count, loff_t pos)
641 {
642 	struct fd f;
643 	ssize_t ret = -EBADF;
644 
645 	if (pos < 0)
646 		return -EINVAL;
647 
648 	f = fdget(fd);
649 	if (f.file) {
650 		ret = -ESPIPE;
651 		if (f.file->f_mode & FMODE_PWRITE)
652 			ret = vfs_write(f.file, buf, count, &pos);
653 		fdput(f);
654 	}
655 
656 	return ret;
657 }
658 
/* pwrite64(2) entry point: all work is done by ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	ssize_t written = ksys_pwrite64(fd, buf, count, pos);

	return written;
}
664 
do_iter_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)665 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
666 		loff_t *ppos, int type, rwf_t flags)
667 {
668 	struct kiocb kiocb;
669 	ssize_t ret;
670 
671 	init_sync_kiocb(&kiocb, filp);
672 	ret = kiocb_set_rw_flags(&kiocb, flags);
673 	if (ret)
674 		return ret;
675 	kiocb.ki_pos = *ppos;
676 
677 	if (type == READ)
678 		ret = call_read_iter(filp, &kiocb, iter);
679 	else
680 		ret = call_write_iter(filp, &kiocb, iter);
681 	BUG_ON(ret == -EIOCBQUEUED);
682 	*ppos = kiocb.ki_pos;
683 	return ret;
684 }
685 
686 /* Do it by hand, with file-ops */
do_loop_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)687 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
688 		loff_t *ppos, int type, rwf_t flags)
689 {
690 	ssize_t ret = 0;
691 
692 	if (flags & ~RWF_HIPRI)
693 		return -EOPNOTSUPP;
694 
695 	while (iov_iter_count(iter)) {
696 		struct iovec iovec = iov_iter_iovec(iter);
697 		ssize_t nr;
698 
699 		if (type == READ) {
700 			nr = filp->f_op->read(filp, iovec.iov_base,
701 					      iovec.iov_len, ppos);
702 		} else {
703 			nr = filp->f_op->write(filp, iovec.iov_base,
704 					       iovec.iov_len, ppos);
705 		}
706 
707 		if (nr < 0) {
708 			if (!ret)
709 				ret = nr;
710 			break;
711 		}
712 		ret += nr;
713 		if (nr != iovec.iov_len)
714 			break;
715 		iov_iter_advance(iter, nr);
716 	}
717 
718 	return ret;
719 }
720 
/*
 * A file read stores into the user buffer (VERIFY_WRITE); any other
 * direction loads from it (VERIFY_READ).
 */
#define vrfy_dir(type) ((type) != READ ? VERIFY_READ : VERIFY_WRITE)
723 
724 /**
725  * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
726  *     into the kernel and check that it is valid.
727  *
728  * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
729  * @uvector: Pointer to the userspace array.
730  * @nr_segs: Number of elements in userspace array.
731  * @fast_segs: Number of elements in @fast_pointer.
732  * @fast_pointer: Pointer to (usually small on-stack) kernel array.
733  * @ret_pointer: (output parameter) Pointer to a variable that will point to
734  *     either @fast_pointer, a newly allocated kernel array, or NULL,
735  *     depending on which array was used.
736  *
737  * This function copies an array of &struct iovec of @nr_segs from
738  * userspace into the kernel and checks that each element is valid (e.g.
739  * it does not point to a kernel address or cause overflow by being too
740  * large, etc.).
741  *
742  * As an optimization, the caller may provide a pointer to a small
743  * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
744  * (the size of this array, or 0 if unused, should be given in @fast_segs).
745  *
746  * @ret_pointer will always point to the array that was used, so the
747  * caller must take care not to call kfree() on it e.g. in case the
748  * @fast_pointer array was used and it was allocated on the stack.
749  *
750  * Return: The total number of bytes covered by the iovec array on success
751  *   or a negative error code on error.
752  */
rw_copy_check_uvector(int type,const struct iovec __user * uvector,unsigned long nr_segs,unsigned long fast_segs,struct iovec * fast_pointer,struct iovec ** ret_pointer)753 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
754 			      unsigned long nr_segs, unsigned long fast_segs,
755 			      struct iovec *fast_pointer,
756 			      struct iovec **ret_pointer)
757 {
758 	unsigned long seg;
759 	ssize_t ret;
760 	struct iovec *iov = fast_pointer;
761 
762 	/*
763 	 * SuS says "The readv() function *may* fail if the iovcnt argument
764 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
765 	 * traditionally returned zero for zero segments, so...
766 	 */
767 	if (nr_segs == 0) {
768 		ret = 0;
769 		goto out;
770 	}
771 
772 	/*
773 	 * First get the "struct iovec" from user memory and
774 	 * verify all the pointers
775 	 */
776 	if (nr_segs > UIO_MAXIOV) {
777 		ret = -EINVAL;
778 		goto out;
779 	}
780 	if (nr_segs > fast_segs) {
781 		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
782 		if (iov == NULL) {
783 			ret = -ENOMEM;
784 			goto out;
785 		}
786 	}
787 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
788 		ret = -EFAULT;
789 		goto out;
790 	}
791 
792 	/*
793 	 * According to the Single Unix Specification we should return EINVAL
794 	 * if an element length is < 0 when cast to ssize_t or if the
795 	 * total length would overflow the ssize_t return value of the
796 	 * system call.
797 	 *
798 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
799 	 * overflow case.
800 	 */
801 	ret = 0;
802 	for (seg = 0; seg < nr_segs; seg++) {
803 		void __user *buf = iov[seg].iov_base;
804 		ssize_t len = (ssize_t)iov[seg].iov_len;
805 
806 		/* see if we we're about to use an invalid len or if
807 		 * it's about to overflow ssize_t */
808 		if (len < 0) {
809 			ret = -EINVAL;
810 			goto out;
811 		}
812 		if (type >= 0
813 		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
814 			ret = -EFAULT;
815 			goto out;
816 		}
817 		if (len > MAX_RW_COUNT - ret) {
818 			len = MAX_RW_COUNT - ret;
819 			iov[seg].iov_len = len;
820 		}
821 		ret += len;
822 	}
823 out:
824 	*ret_pointer = iov;
825 	return ret;
826 }
827 
828 #ifdef CONFIG_COMPAT
compat_rw_copy_check_uvector(int type,const struct compat_iovec __user * uvector,unsigned long nr_segs,unsigned long fast_segs,struct iovec * fast_pointer,struct iovec ** ret_pointer)829 ssize_t compat_rw_copy_check_uvector(int type,
830 		const struct compat_iovec __user *uvector, unsigned long nr_segs,
831 		unsigned long fast_segs, struct iovec *fast_pointer,
832 		struct iovec **ret_pointer)
833 {
834 	compat_ssize_t tot_len;
835 	struct iovec *iov = *ret_pointer = fast_pointer;
836 	ssize_t ret = 0;
837 	int seg;
838 
839 	/*
840 	 * SuS says "The readv() function *may* fail if the iovcnt argument
841 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
842 	 * traditionally returned zero for zero segments, so...
843 	 */
844 	if (nr_segs == 0)
845 		goto out;
846 
847 	ret = -EINVAL;
848 	if (nr_segs > UIO_MAXIOV)
849 		goto out;
850 	if (nr_segs > fast_segs) {
851 		ret = -ENOMEM;
852 		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
853 		if (iov == NULL)
854 			goto out;
855 	}
856 	*ret_pointer = iov;
857 
858 	ret = -EFAULT;
859 	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
860 		goto out;
861 
862 	/*
863 	 * Single unix specification:
864 	 * We should -EINVAL if an element length is not >= 0 and fitting an
865 	 * ssize_t.
866 	 *
867 	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
868 	 * no overflow possibility.
869 	 */
870 	tot_len = 0;
871 	ret = -EINVAL;
872 	for (seg = 0; seg < nr_segs; seg++) {
873 		compat_uptr_t buf;
874 		compat_ssize_t len;
875 
876 		if (__get_user(len, &uvector->iov_len) ||
877 		   __get_user(buf, &uvector->iov_base)) {
878 			ret = -EFAULT;
879 			goto out;
880 		}
881 		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
882 			goto out;
883 		if (type >= 0 &&
884 		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
885 			ret = -EFAULT;
886 			goto out;
887 		}
888 		if (len > MAX_RW_COUNT - tot_len)
889 			len = MAX_RW_COUNT - tot_len;
890 		tot_len += len;
891 		iov->iov_base = compat_ptr(buf);
892 		iov->iov_len = (compat_size_t) len;
893 		uvector++;
894 		iov++;
895 	}
896 	ret = tot_len;
897 
898 out:
899 	return ret;
900 }
901 #endif
902 
do_iter_read(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)903 static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
904 		loff_t *pos, rwf_t flags)
905 {
906 	size_t tot_len;
907 	ssize_t ret = 0;
908 
909 	if (!(file->f_mode & FMODE_READ))
910 		return -EBADF;
911 	if (!(file->f_mode & FMODE_CAN_READ))
912 		return -EINVAL;
913 
914 	tot_len = iov_iter_count(iter);
915 	if (!tot_len)
916 		goto out;
917 	ret = rw_verify_area(READ, file, pos, tot_len);
918 	if (ret < 0)
919 		return ret;
920 
921 	if (file->f_op->read_iter)
922 		ret = do_iter_readv_writev(file, iter, pos, READ, flags);
923 	else
924 		ret = do_loop_readv_writev(file, iter, pos, READ, flags);
925 out:
926 	if (ret >= 0)
927 		fsnotify_access(file);
928 	return ret;
929 }
930 
vfs_iter_read(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)931 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
932 		rwf_t flags)
933 {
934 	if (!file->f_op->read_iter)
935 		return -EINVAL;
936 	return do_iter_read(file, iter, ppos, flags);
937 }
938 EXPORT_SYMBOL(vfs_iter_read);
939 
do_iter_write(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)940 static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
941 		loff_t *pos, rwf_t flags)
942 {
943 	size_t tot_len;
944 	ssize_t ret = 0;
945 
946 	if (!(file->f_mode & FMODE_WRITE))
947 		return -EBADF;
948 	if (!(file->f_mode & FMODE_CAN_WRITE))
949 		return -EINVAL;
950 
951 	tot_len = iov_iter_count(iter);
952 	if (!tot_len)
953 		return 0;
954 	ret = rw_verify_area(WRITE, file, pos, tot_len);
955 	if (ret < 0)
956 		return ret;
957 
958 	if (file->f_op->write_iter)
959 		ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
960 	else
961 		ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
962 	if (ret > 0)
963 		fsnotify_modify(file);
964 	return ret;
965 }
966 
vfs_iter_write(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)967 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
968 		rwf_t flags)
969 {
970 	if (!file->f_op->write_iter)
971 		return -EINVAL;
972 	return do_iter_write(file, iter, ppos, flags);
973 }
974 EXPORT_SYMBOL(vfs_iter_write);
975 
vfs_readv(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)976 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
977 		  unsigned long vlen, loff_t *pos, rwf_t flags)
978 {
979 	struct iovec iovstack[UIO_FASTIOV];
980 	struct iovec *iov = iovstack;
981 	struct iov_iter iter;
982 	ssize_t ret;
983 
984 	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
985 	if (ret >= 0) {
986 		ret = do_iter_read(file, &iter, pos, flags);
987 		kfree(iov);
988 	}
989 
990 	return ret;
991 }
992 
vfs_writev(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)993 static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
994 		   unsigned long vlen, loff_t *pos, rwf_t flags)
995 {
996 	struct iovec iovstack[UIO_FASTIOV];
997 	struct iovec *iov = iovstack;
998 	struct iov_iter iter;
999 	ssize_t ret;
1000 
1001 	ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1002 	if (ret >= 0) {
1003 		file_start_write(file);
1004 		ret = do_iter_write(file, &iter, pos, flags);
1005 		file_end_write(file);
1006 		kfree(iov);
1007 	}
1008 	return ret;
1009 }
1010 
do_readv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)1011 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1012 			unsigned long vlen, rwf_t flags)
1013 {
1014 	struct fd f = fdget_pos(fd);
1015 	ssize_t ret = -EBADF;
1016 
1017 	if (f.file) {
1018 		loff_t pos = file_pos_read(f.file);
1019 		ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1020 		if (ret >= 0)
1021 			file_pos_write(f.file, pos);
1022 		fdput_pos(f);
1023 	}
1024 
1025 	if (ret > 0)
1026 		add_rchar(current, ret);
1027 	inc_syscr(current);
1028 	return ret;
1029 }
1030 
do_writev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)1031 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1032 			 unsigned long vlen, rwf_t flags)
1033 {
1034 	struct fd f = fdget_pos(fd);
1035 	ssize_t ret = -EBADF;
1036 
1037 	if (f.file) {
1038 		loff_t pos = file_pos_read(f.file);
1039 		ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1040 		if (ret >= 0)
1041 			file_pos_write(f.file, pos);
1042 		fdput_pos(f);
1043 	}
1044 
1045 	if (ret > 0)
1046 		add_wchar(current, ret);
1047 	inc_syscw(current);
1048 	return ret;
1049 }
1050 
pos_from_hilo(unsigned long high,unsigned long low)1051 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1052 {
1053 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
1054 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1055 }
1056 
do_preadv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)1057 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1058 			 unsigned long vlen, loff_t pos, rwf_t flags)
1059 {
1060 	struct fd f;
1061 	ssize_t ret = -EBADF;
1062 
1063 	if (pos < 0)
1064 		return -EINVAL;
1065 
1066 	f = fdget(fd);
1067 	if (f.file) {
1068 		ret = -ESPIPE;
1069 		if (f.file->f_mode & FMODE_PREAD)
1070 			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1071 		fdput(f);
1072 	}
1073 
1074 	if (ret > 0)
1075 		add_rchar(current, ret);
1076 	inc_syscr(current);
1077 	return ret;
1078 }
1079 
do_pwritev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)1080 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1081 			  unsigned long vlen, loff_t pos, rwf_t flags)
1082 {
1083 	struct fd f;
1084 	ssize_t ret = -EBADF;
1085 
1086 	if (pos < 0)
1087 		return -EINVAL;
1088 
1089 	f = fdget(fd);
1090 	if (f.file) {
1091 		ret = -ESPIPE;
1092 		if (f.file->f_mode & FMODE_PWRITE)
1093 			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1094 		fdput(f);
1095 	}
1096 
1097 	if (ret > 0)
1098 		add_wchar(current, ret);
1099 	inc_syscw(current);
1100 	return ret;
1101 }
1102 
/* readv(2) entry point: do_readv() with no per-call flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_readv(fd, vec, vlen, 0);
}
1108 
/* writev(2) entry point: do_writev() with no per-call flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_writev(fd, vec, vlen, 0);
}
1114 
/*
 * preadv(2) entry point: the offset arrives split into @pos_l/@pos_h and is
 * reassembled with pos_from_hilo(); no per-call flags are passed on.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1122 
/*
 * preadv2(2) entry point.  The offset is reassembled from @pos_l/@pos_h.
 * An offset of -1 is routed to do_readv(), which works from the file's
 * current position; every other offset goes to do_preadv().
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos != -1)
		return do_preadv(fd, vec, vlen, pos, flags);

	return do_readv(fd, vec, vlen, flags);
}
1134 
/*
 * pwritev(2) entry point: the offset arrives split into @pos_l/@pos_h and is
 * reassembled with pos_from_hilo(); no per-call flags are passed on.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1142 
/*
 * pwritev2(2) entry point.  The offset is reassembled from @pos_l/@pos_h.
 * An offset of -1 is routed to do_writev(), which works from the file's
 * current position; every other offset goes to do_pwritev().
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos != -1)
		return do_pwritev(fd, vec, vlen, pos, flags);

	return do_writev(fd, vec, vlen, flags);
}
1154 
1155 #ifdef CONFIG_COMPAT
/*
 * compat_readv - read from @file into a compat iovec array
 * @file:	file to read from
 * @vec:	user pointer to the compat iovec array
 * @vlen:	number of entries in @vec
 * @pos:	in/out file offset handed to do_iter_read()
 * @flags:	per-call RWF_* flags
 *
 * Returns the result of do_iter_read(), or the negative value returned by
 * compat_import_iovec() when the import fails.  Because negative errno
 * values are returned, the return type is the signed ssize_t rather than
 * size_t.
 *
 * The read-syscall counter is bumped on every call, including failed
 * imports; the byte counter only when at least one byte was read.
 */
static ssize_t compat_readv(struct file *file,
			    const struct compat_iovec __user *vec,
			    unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
	if (ret >= 0) {
		ret = do_iter_read(file, &iter, pos, flags);
		kfree(iov);
	}
	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}
1175 
/*
 * do_compat_readv - compat readv at the file's current position
 * @fd:		descriptor to read from
 * @vec:	user pointer to the compat iovec array
 * @vlen:	number of entries in @vec
 * @flags:	per-call RWF_* flags
 *
 * Reads via compat_readv() starting at the file's f_pos and stores the
 * advanced position back when the read did not fail.
 *
 * Returns the result of compat_readv(), or -EBADF when @fd does not resolve
 * to a file.  Because negative errno values are returned, the return type
 * is the signed ssize_t rather than size_t.
 */
static ssize_t do_compat_readv(compat_ulong_t fd,
				 const struct compat_iovec __user *vec,
				 compat_ulong_t vlen, rwf_t flags)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret;
	loff_t pos;

	if (!f.file)
		return -EBADF;

	pos = f.file->f_pos;
	ret = compat_readv(f.file, vec, vlen, &pos, flags);
	if (ret >= 0)
		f.file->f_pos = pos;
	fdput_pos(f);

	return ret;
}
1194 
/* Compat readv entry point: do_compat_readv() with no per-call flags. */
COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen)
{
	return do_compat_readv(fd, vec, vlen, 0);
}
1201 
/*
 * do_compat_preadv64 - compat vectored read at an explicit 64-bit offset
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to a
 * file, -ESPIPE when the file lacks FMODE_PREAD, and otherwise the result of
 * compat_readv().  @pos is a local copy; the file's own position is not
 * updated here.
 */
static long do_compat_preadv64(unsigned long fd,
				  const struct compat_iovec __user *vec,
				  unsigned long vlen, loff_t pos, rwf_t flags)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PREAD)
		ret = compat_readv(f.file, vec, vlen, &pos, flags);
	else
		ret = -ESPIPE;

	fdput(f);
	return ret;
}
1220 
1221 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64 entry point: the offset arrives as a single loff_t and is
 * passed straight to do_compat_preadv64() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
1228 #endif
1229 
/*
 * Compat preadv entry point: the 64-bit offset is rebuilt from its two
 * 32-bit halves and passed to do_compat_preadv64() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
1238 
1239 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2 entry point: offset and flags are forwarded unchanged
 * to do_compat_preadv64().
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return do_compat_preadv64(fd, vec, vlen, pos, flags);
}
1246 #endif
1247 
/*
 * Compat preadv2 entry point.  The 64-bit offset is rebuilt from its two
 * 32-bit halves.  An offset of -1 is routed to do_compat_readv(), which
 * works from the file's f_pos; every other offset goes to
 * do_compat_preadv64().
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		rwf_t, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	if (pos != -1)
		return do_compat_preadv64(fd, vec, vlen, pos, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
1260 
/*
 * compat_writev - write a compat iovec array to @file
 * @file:	file to write to
 * @vec:	user pointer to the compat iovec array
 * @vlen:	number of entries in @vec
 * @pos:	in/out file offset handed to do_iter_write()
 * @flags:	per-call RWF_* flags
 *
 * The write is bracketed by file_start_write()/file_end_write().
 *
 * Returns the result of do_iter_write(), or the negative value returned by
 * compat_import_iovec() when the import fails.  Because negative errno
 * values are returned, the return type is the signed ssize_t rather than
 * size_t.
 *
 * The write-syscall counter is bumped on every call, including failed
 * imports; the byte counter only when at least one byte was written.
 */
static ssize_t compat_writev(struct file *file,
			     const struct compat_iovec __user *vec,
			     unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
	if (ret >= 0) {
		file_start_write(file);
		ret = do_iter_write(file, &iter, pos, flags);
		file_end_write(file);
		kfree(iov);
	}
	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}
1282 
/*
 * do_compat_writev - compat writev at the file's current position
 * @fd:		descriptor to write to
 * @vec:	user pointer to the compat iovec array
 * @vlen:	number of entries in @vec
 * @flags:	per-call RWF_* flags
 *
 * Writes via compat_writev() starting at the file's f_pos and stores the
 * advanced position back when the write did not fail.
 *
 * Returns the result of compat_writev(), or -EBADF when @fd does not resolve
 * to a file.  Because negative errno values are returned, the return type
 * is the signed ssize_t rather than size_t.
 */
static ssize_t do_compat_writev(compat_ulong_t fd,
				  const struct compat_iovec __user *vec,
				  compat_ulong_t vlen, rwf_t flags)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret;
	loff_t pos;

	if (!f.file)
		return -EBADF;

	pos = f.file->f_pos;
	ret = compat_writev(f.file, vec, vlen, &pos, flags);
	if (ret >= 0)
		f.file->f_pos = pos;
	fdput_pos(f);

	return ret;
}
1300 
/* Compat writev entry point: do_compat_writev() with no per-call flags. */
COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen)
{
	return do_compat_writev(fd, vec, vlen, 0);
}
1307 
/*
 * do_compat_pwritev64 - compat vectored write at an explicit 64-bit offset
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to a
 * file, -ESPIPE when the file lacks FMODE_PWRITE, and otherwise the result
 * of compat_writev().  @pos is a local copy; the file's own position is not
 * updated here.
 */
static long do_compat_pwritev64(unsigned long fd,
				   const struct compat_iovec __user *vec,
				   unsigned long vlen, loff_t pos, rwf_t flags)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PWRITE)
		ret = compat_writev(f.file, vec, vlen, &pos, flags);
	else
		ret = -ESPIPE;

	fdput(f);
	return ret;
}
1326 
1327 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64 entry point: the offset arrives as a single loff_t and is
 * passed straight to do_compat_pwritev64() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
1334 #endif
1335 
/*
 * Compat pwritev entry point: the 64-bit offset is rebuilt from its two
 * 32-bit halves and passed to do_compat_pwritev64() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
1344 
1345 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2 entry point: offset and flags are forwarded unchanged
 * to do_compat_pwritev64().
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
}
1352 #endif
1353 
/*
 * Compat pwritev2 entry point.  The 64-bit offset is rebuilt from its two
 * 32-bit halves.  An offset of -1 is routed to do_compat_writev(), which
 * works from the file's f_pos; every other offset goes to
 * do_compat_pwritev64().
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	if (pos != -1)
		return do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
1365 
1366 #endif
1367 
/*
 * do_sendfile - common implementation behind the sendfile() syscall variants
 * @out_fd:	descriptor to write to; must have FMODE_WRITE
 * @in_fd:	descriptor to read from; must have FMODE_READ
 * @ppos:	if non-NULL, the input offset is read from *@ppos (the input
 *		file must have FMODE_PREAD, else -ESPIPE) and the advanced
 *		offset is stored back there after a successful transfer; if
 *		NULL, the input file's f_pos is used and updated instead
 * @count:	bytes requested; clamped to MAX_RW_COUNT
 * @max:	upper bound for the input offset; 0 means "use the smaller of
 *		the two superblocks' s_maxbytes"
 *
 * Returns the value of do_splice_direct(), or a negative errno.  Positions
 * and accounting/notification are only updated when at least one byte was
 * transferred.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
		  	   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	/* -ESPIPE is only reported for the explicit-offset case below */
	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	/*
	 * NOTE(review): this -EINVAL is overwritten by rw_verify_area() below
	 * before any path can return it.
	 */
	retval = -EINVAL;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	/* the output side always starts at the output file's own f_pos */
	out_pos = out.file->f_pos;
	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
	if (retval < 0)
		goto fput_out;

	/* no caller-supplied limit: fall back to the tighter filesystem limit */
	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	/*
	 * Trim the request so it ends at @max; fail with -EOVERFLOW only when
	 * the starting offset is already at or beyond the limit.
	 */
	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	file_start_write(out.file);
	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
	file_end_write(out.file);

	if (retval > 0) {
		/* the same byte count is charged as both read and written */
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		/* hand the advanced input offset back to wherever it came from */
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	/* syscall counters are bumped whether or not any data moved */
	inc_syscr(current);
	inc_syscw(current);
	/* an input offset past the limit overrides the transfer result */
	if (pos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}
1467 
/*
 * sendfile(2) entry point taking an off_t offset.
 *
 * With a NULL @offset the transfer uses the input file's own position and no
 * explicit limit.  Otherwise the offset is fetched from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting offset is written
 * back to user space regardless of the transfer result; a fault on either
 * user access yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(off, offset)))
		return -EFAULT;

	pos = off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1486 
/*
 * sendfile64(2) entry point taking a loff_t offset.
 *
 * With a NULL @offset the transfer uses the input file's own position.
 * Otherwise the offset is copied in from user space and the resulting offset
 * is written back regardless of the transfer result; a fault on either user
 * access yields -EFAULT.  No explicit limit is passed in either case.
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1503 
1504 #ifdef CONFIG_COMPAT
/*
 * Compat sendfile entry point taking a compat_off_t offset.
 *
 * With a NULL @offset the transfer uses the input file's own position and no
 * explicit limit.  Otherwise the offset is fetched from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting offset is written
 * back regardless of the transfer result; a fault on either user access
 * yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(off, offset)))
		return -EFAULT;

	pos = off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1524 
/*
 * Compat sendfile64 entry point taking a compat_loff_t offset.
 *
 * With a NULL @offset the transfer uses the input file's own position.
 * Otherwise sizeof(loff_t) bytes are copied in from user space as the
 * offset and the resulting offset is written back regardless of the
 * transfer result; a fault on either user access yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1542 #endif
1543 
/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows returning partial success.  When it does so is up to
 * the copy_file_range method.
 */
/*
 * @file_in/@pos_in:	source file and starting offset
 * @file_out/@pos_out:	destination file and starting offset
 * @len:		number of bytes requested
 * @flags:		must be 0, otherwise -EINVAL
 *
 * Three mechanisms are tried in order: the source's ->clone_file_range, the
 * destination's ->copy_file_range, and finally do_splice_direct().
 *
 * Returns the number of bytes copied (0 when @len is 0) or a negative errno.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
			    struct file *file_out, loff_t pos_out,
			    size_t len, unsigned int flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	ssize_t ret;

	if (flags != 0)
		return -EINVAL;

	/* only regular files on both sides; directories get their own errno */
	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
		return -EINVAL;

	ret = rw_verify_area(READ, file_in, &pos_in, len);
	if (unlikely(ret))
		return ret;

	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
	if (unlikely(ret))
		return ret;

	/* source must be readable; destination writable and not O_APPEND */
	if (!(file_in->f_mode & FMODE_READ) ||
	    !(file_out->f_mode & FMODE_WRITE) ||
	    (file_out->f_flags & O_APPEND))
		return -EBADF;

	/* this could be relaxed once a method supports cross-fs copies */
	if (inode_in->i_sb != inode_out->i_sb)
		return -EXDEV;

	/* a zero-length request succeeds without any accounting below */
	if (len == 0)
		return 0;

	file_start_write(file_out);

	/*
	 * Try cloning first, this is supported by more file systems, and
	 * more efficient if both clone and copy are supported (e.g. NFS).
	 */
	if (file_in->f_op->clone_file_range) {
		ret = file_in->f_op->clone_file_range(file_in, pos_in,
				file_out, pos_out, len);
		/* a successful clone is reported as the whole @len copied */
		if (ret == 0) {
			ret = len;
			goto done;
		}
	}

	/* any result other than -EOPNOTSUPP (including errors) is final */
	if (file_out->f_op->copy_file_range) {
		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
						      pos_out, len, flags);
		if (ret != -EOPNOTSUPP)
			goto done;
	}

	/* last resort: splice, with the length capped at MAX_RW_COUNT */
	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);

done:
	/* notifications and byte accounting only when data actually moved */
	if (ret > 0) {
		fsnotify_access(file_in);
		add_rchar(current, ret);
		fsnotify_modify(file_out);
		add_wchar(current, ret);
	}

	inc_syscr(current);
	inc_syscw(current);

	file_end_write(file_out);

	return ret;
}
1625 EXPORT_SYMBOL(vfs_copy_file_range);
1626 
/*
 * copy_file_range(2) entry point.
 *
 * For each side, a NULL offset pointer means "use and update the file's own
 * f_pos"; a non-NULL pointer means the offset is copied in from user space
 * and the advanced offset is copied back out.  Offsets are only advanced
 * when vfs_copy_file_range() reports more than zero bytes copied.
 *
 * Returns the result of vfs_copy_file_range(), -EBADF when either
 * descriptor does not resolve to a file, or -EFAULT when a user-space
 * offset cannot be read or written.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd f_in, f_out;
	loff_t pos_in, pos_out;
	ssize_t ret;

	f_in = fdget(fd_in);
	if (!f_in.file)
		return -EBADF;

	f_out = fdget(fd_out);
	if (!f_out.file) {
		ret = -EBADF;
		goto put_in;
	}

	/* both offset fetches below fail with -EFAULT */
	ret = -EFAULT;
	if (!off_in)
		pos_in = f_in.file->f_pos;
	else if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
		goto put_out;

	if (!off_out)
		pos_out = f_out.file->f_pos;
	else if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
		goto put_out;

	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
				  flags);
	if (ret <= 0)
		goto put_out;

	pos_in += ret;
	pos_out += ret;

	/* a failed write-back turns the result into -EFAULT */
	if (!off_in)
		f_in.file->f_pos = pos_in;
	else if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
		ret = -EFAULT;

	if (!off_out)
		f_out.file->f_pos = pos_out;
	else if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
		ret = -EFAULT;

put_out:
	fdput(f_out);
put_in:
	fdput(f_in);
	return ret;
}
1688 
/*
 * clone_verify_area - validate a byte range of @file for clone/dedupe
 * @file:	file the range belongs to
 * @pos:	start of the range
 * @len:	length of the range; 0 makes the mandatory-lock check extend
 *		to OFFSET_MAX
 * @write:	true when the range will be written, false when only read
 *
 * Returns -EINVAL when @pos or @pos + @len is negative as a loff_t, a
 * negative value from locks_mandatory_area() when that check fails, and
 * otherwise the result of security_file_permission().
 */
static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
{
	struct inode *inode = file_inode(file);
	int mask = write ? MAY_WRITE : MAY_READ;

	if (unlikely(pos < 0))
		return -EINVAL;
	if (unlikely((loff_t) (pos + len) < 0))
		return -EINVAL;

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		int lock_type = write ? F_WRLCK : F_RDLCK;
		loff_t end;
		int err;

		if (len)
			end = pos + len - 1;
		else
			end = OFFSET_MAX;

		err = locks_mandatory_area(inode, file, pos, end, lock_type);
		if (err < 0)
			return err;
	}

	return security_file_permission(file, mask);
}
1711 
1712 /*
1713  * Check that the two inodes are eligible for cloning, the ranges make
1714  * sense, and then flush all dirty data.  Caller must ensure that the
1715  * inodes have been locked against any other modifications.
1716  *
1717  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1718  * the usual negative error code.
1719  */
/*
 * @inode_in/@pos_in:	source inode and starting offset
 * @inode_out/@pos_out:	destination inode and starting offset
 * @len:		in/out length; a value of 0 on a non-dedupe request is
 *			replaced by the distance from @pos_in to the source's
 *			EOF
 * @is_dedupe:		true for dedupe (adds destination-EOF and content
 *			comparison checks), false for reflink
 */
int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
			       struct inode *inode_out, loff_t pos_out,
			       u64 *len, bool is_dedupe)
{
	/* alignment checks below use the destination superblock's block size */
	loff_t bs = inode_out->i_sb->s_blocksize;
	loff_t blen;
	loff_t isize;
	bool same_inode = (inode_in == inode_out);
	int ret;

	/* Don't touch certain kinds of inodes */
	if (IS_IMMUTABLE(inode_out))
		return -EPERM;

	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
		return -ETXTBSY;

	/* Don't reflink dirs, pipes, sockets... */
	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
		return -EINVAL;

	/* Are we going all the way to the end? */
	isize = i_size_read(inode_in);
	/* an empty source file means there is nothing to clone */
	if (isize == 0)
		return 0;

	/* Zero length dedupe exits immediately; reflink goes to EOF. */
	if (*len == 0) {
		if (is_dedupe || pos_in == isize)
			return 0;
		if (pos_in > isize)
			return -EINVAL;
		*len = isize - pos_in;
	}

	/* Ensure offsets don't wrap and the input is inside i_size */
	if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
	    pos_in + *len > isize)
		return -EINVAL;

	/* Don't allow dedupe past EOF in the dest file */
	if (is_dedupe) {
		loff_t	disize;

		disize = i_size_read(inode_out);
		if (pos_out >= disize || pos_out + *len > disize)
			return -EINVAL;
	}

	/* If we're linking to EOF, continue to the block boundary. */
	if (pos_in + *len == isize)
		blen = ALIGN(isize, bs) - pos_in;
	else
		blen = *len;

	/* Only reflink if we're aligned to block boundaries */
	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
		return -EINVAL;

	/* Don't allow overlapped reflink within the same file */
	if (same_inode) {
		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
			return -EINVAL;
	}

	/* Wait for the completion of any pending IOs on both files */
	inode_dio_wait(inode_in);
	if (!same_inode)
		inode_dio_wait(inode_out);

	/* write back both ranges; these use *len, not the rounded-up blen */
	ret = filemap_write_and_wait_range(inode_in->i_mapping,
			pos_in, pos_in + *len - 1);
	if (ret)
		return ret;

	ret = filemap_write_and_wait_range(inode_out->i_mapping,
			pos_out, pos_out + *len - 1);
	if (ret)
		return ret;

	/*
	 * Check that the extents are the same.
	 */
	if (is_dedupe) {
		bool		is_same = false;

		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
				inode_out, pos_out, *len, &is_same);
		if (ret)
			return ret;
		/* differing contents are reported as -EBADE */
		if (!is_same)
			return -EBADE;
	}

	/* all checks passed: there is something to clone */
	return 1;
}
1819 EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1820 
/*
 * do_clone_file_range - validate a clone request and invoke the filesystem
 * @file_in/@pos_in:	source file and starting offset
 * @file_out/@pos_out:	destination file and starting offset
 * @len:		number of bytes to clone
 *
 * Checks run in this order: both inodes are regular files (-EISDIR for
 * directories, -EINVAL otherwise), both live on the same superblock
 * (-EXDEV), the source is readable and the destination writable and not
 * O_APPEND (-EBADF), the source implements ->clone_file_range
 * (-EOPNOTSUPP), both ranges pass clone_verify_area(), and the source range
 * ends within the source's size (-EINVAL).
 *
 * Returns the result of ->clone_file_range; access/modify notifications are
 * sent when that result is 0.
 */
int do_clone_file_range(struct file *file_in, loff_t pos_in,
			struct file *file_out, loff_t pos_out, u64 len)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	bool can_read, can_write, appending;
	int ret;

	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
		return -EINVAL;

	/*
	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
	 * the same mount. Practically, they only need to be on the same file
	 * system.
	 */
	if (inode_in->i_sb != inode_out->i_sb)
		return -EXDEV;

	can_read = file_in->f_mode & FMODE_READ;
	can_write = file_out->f_mode & FMODE_WRITE;
	appending = file_out->f_flags & O_APPEND;
	if (!can_read || !can_write || appending)
		return -EBADF;

	if (!file_in->f_op->clone_file_range)
		return -EOPNOTSUPP;

	ret = clone_verify_area(file_in, pos_in, len, false);
	if (ret)
		return ret;

	ret = clone_verify_area(file_out, pos_out, len, true);
	if (ret)
		return ret;

	if (pos_in + len > i_size_read(inode_in))
		return -EINVAL;

	ret = file_in->f_op->clone_file_range(file_in, pos_in,
			file_out, pos_out, len);
	if (ret == 0) {
		fsnotify_access(file_in);
		fsnotify_modify(file_out);
	}

	return ret;
}
1869 EXPORT_SYMBOL(do_clone_file_range);
1870 
/*
 * vfs_clone_file_range - do_clone_file_range() bracketed by
 * file_start_write()/file_end_write() on the destination file.
 *
 * Returns whatever do_clone_file_range() returns.
 */
int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
			 struct file *file_out, loff_t pos_out, u64 len)
{
	int err;

	file_start_write(file_out);
	err = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
	file_end_write(file_out);

	return err;
}
1882 EXPORT_SYMBOL(vfs_clone_file_range);
1883 
1884 /*
1885  * Read a page's worth of file data into the page cache.  Return the page
1886  * locked.
1887  */
static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
{
	/* page index that contains byte @offset */
	pgoff_t index = offset >> PAGE_SHIFT;
	struct page *page;

	page = read_mapping_page(inode->i_mapping, index, NULL);
	if (IS_ERR(page))
		return page;

	if (PageUptodate(page)) {
		lock_page(page);
		return page;
	}

	/* the read did not leave the page up to date: drop it, report -EIO */
	put_page(page);
	return ERR_PTR(-EIO);
}
1906 
1907 /*
1908  * Compare extents of two files to see if they are the same.
1909  * Caller must have locked both inodes to prevent write races.
1910  */
/*
 * @src/@srcoff:	first inode and the offset to start comparing at
 * @dest/@destoff:	second inode and the offset to start comparing at
 * @len:		number of bytes to compare
 * @is_same:		set to true when every compared byte matched, false
 *			on the first mismatch; written only when 0 is returned
 *
 * Returns 0 when the comparison ran (check @is_same for the outcome), or a
 * negative errno when a page could not be obtained or the per-iteration
 * length was not positive.
 */
int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
				  struct inode *dest, loff_t destoff,
				  loff_t len, bool *is_same)
{
	loff_t src_poff;
	loff_t dest_poff;
	void *src_addr;
	void *dest_addr;
	struct page *src_page;
	struct page *dest_page;
	loff_t cmp_len;
	bool same;
	int error;

	/* default error for the cmp_len sanity check inside the loop */
	error = -EINVAL;
	same = true;
	while (len) {
		/* offsets of the current positions within their pages */
		src_poff = srcoff & (PAGE_SIZE - 1);
		dest_poff = destoff & (PAGE_SIZE - 1);
		/*
		 * Compare no further than the end of either page, and no
		 * further than the bytes remaining.
		 */
		cmp_len = min(PAGE_SIZE - src_poff,
			      PAGE_SIZE - dest_poff);
		cmp_len = min(cmp_len, len);
		if (cmp_len <= 0)
			goto out_error;

		src_page = vfs_dedupe_get_page(src, srcoff);
		if (IS_ERR(src_page)) {
			error = PTR_ERR(src_page);
			goto out_error;
		}
		dest_page = vfs_dedupe_get_page(dest, destoff);
		if (IS_ERR(dest_page)) {
			error = PTR_ERR(dest_page);
			/* release the source page obtained just above */
			unlock_page(src_page);
			put_page(src_page);
			goto out_error;
		}
		src_addr = kmap_atomic(src_page);
		dest_addr = kmap_atomic(dest_page);

		flush_dcache_page(src_page);
		flush_dcache_page(dest_page);

		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
			same = false;

		/* unmap in the reverse order of mapping, then unlock and drop */
		kunmap_atomic(dest_addr);
		kunmap_atomic(src_addr);
		unlock_page(dest_page);
		unlock_page(src_page);
		put_page(dest_page);
		put_page(src_page);

		/* stop at the first differing chunk */
		if (!same)
			break;

		srcoff += cmp_len;
		destoff += cmp_len;
		len -= cmp_len;
	}

	*is_same = same;
	return 0;

out_error:
	return error;
}
1978 EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
1979 
/*
 * vfs_dedupe_file_range_one - dedupe one source range into one destination
 * @src_file/@src_pos:	source file and offset
 * @dst_file/@dst_pos:	destination file and offset
 * @len:		number of bytes to dedupe
 *
 * Takes write access on the destination's mount for the duration of the
 * call.  Checks run in this order: clone_verify_area() on the destination
 * range, the caller has CAP_SYS_ADMIN or the destination has FMODE_WRITE
 * (-EINVAL), both files are on the same mount (-EXDEV), the destination is
 * not a directory (-EISDIR), and the destination implements
 * ->dedupe_file_range (-EINVAL).
 *
 * Returns the result of ->dedupe_file_range or a negative errno.
 */
int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
			      struct file *dst_file, loff_t dst_pos, u64 len)
{
	s64 ret;

	ret = mnt_want_write_file(dst_file);
	if (ret)
		return ret;

	ret = clone_verify_area(dst_file, dst_pos, len, true);
	if (ret < 0)
		goto out_drop_write;

	if (!capable(CAP_SYS_ADMIN) && !(dst_file->f_mode & FMODE_WRITE)) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (src_file->f_path.mnt != dst_file->f_path.mnt) {
		ret = -EXDEV;
		goto out_drop_write;
	}

	if (S_ISDIR(file_inode(dst_file)->i_mode)) {
		ret = -EISDIR;
		goto out_drop_write;
	}

	if (!dst_file->f_op->dedupe_file_range) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
						dst_file, dst_pos, len);
out_drop_write:
	mnt_drop_write_file(dst_file);

	return ret;
}
2016 EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2017 
vfs_dedupe_file_range(struct file * file,struct file_dedupe_range * same)2018 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2019 {
2020 	struct file_dedupe_range_info *info;
2021 	struct inode *src = file_inode(file);
2022 	u64 off;
2023 	u64 len;
2024 	int i;
2025 	int ret;
2026 	u16 count = same->dest_count;
2027 	int deduped;
2028 
2029 	if (!(file->f_mode & FMODE_READ))
2030 		return -EINVAL;
2031 
2032 	if (same->reserved1 || same->reserved2)
2033 		return -EINVAL;
2034 
2035 	off = same->src_offset;
2036 	len = same->src_length;
2037 
2038 	ret = -EISDIR;
2039 	if (S_ISDIR(src->i_mode))
2040 		goto out;
2041 
2042 	ret = -EINVAL;
2043 	if (!S_ISREG(src->i_mode))
2044 		goto out;
2045 
2046 	ret = clone_verify_area(file, off, len, false);
2047 	if (ret < 0)
2048 		goto out;
2049 	ret = 0;
2050 
2051 	if (off + len > i_size_read(src))
2052 		return -EINVAL;
2053 
2054 	/* Arbitrary 1G limit on a single dedupe request, can be raised. */
2055 	len = min_t(u64, len, 1 << 30);
2056 
2057 	/* pre-format output fields to sane values */
2058 	for (i = 0; i < count; i++) {
2059 		same->info[i].bytes_deduped = 0ULL;
2060 		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2061 	}
2062 
2063 	for (i = 0, info = same->info; i < count; i++, info++) {
2064 		struct fd dst_fd = fdget(info->dest_fd);
2065 		struct file *dst_file = dst_fd.file;
2066 
2067 		if (!dst_file) {
2068 			info->status = -EBADF;
2069 			goto next_loop;
2070 		}
2071 
2072 		if (info->reserved) {
2073 			info->status = -EINVAL;
2074 			goto next_fdput;
2075 		}
2076 
2077 		deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2078 						    info->dest_offset, len);
2079 		if (deduped == -EBADE)
2080 			info->status = FILE_DEDUPE_RANGE_DIFFERS;
2081 		else if (deduped < 0)
2082 			info->status = deduped;
2083 		else
2084 			info->bytes_deduped = len;
2085 
2086 next_fdput:
2087 		fdput(dst_fd);
2088 next_loop:
2089 		if (fatal_signal_pending(current))
2090 			goto out;
2091 	}
2092 
2093 out:
2094 	return ret;
2095 }
2096 EXPORT_SYMBOL(vfs_dedupe_file_range);
2097