1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * Copyright (c) 2016-2018 Christoph Hellwig.
5  * All Rights Reserved.
6  */
7 #include "xfs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_iomap.h"
16 #include "xfs_trace.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_util.h"
19 #include "xfs_reflink.h"
20 
21 struct xfs_writepage_ctx {
22 	struct iomap_writepage_ctx ctx;
23 	unsigned int		data_seq;
24 	unsigned int		cow_seq;
25 };
26 
27 static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx * ctx)28 XFS_WPC(struct iomap_writepage_ctx *ctx)
29 {
30 	return container_of(ctx, struct xfs_writepage_ctx, ctx);
31 }
32 
33 /*
34  * Fast and loose check if this write could update the on-disk inode size.
35  */
xfs_ioend_is_append(struct iomap_ioend * ioend)36 static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
37 {
38 	return ioend->io_offset + ioend->io_size >
39 		XFS_I(ioend->io_inode)->i_disk_size;
40 }
41 
42 /*
43  * Update on-disk file size now that data has been written to disk.
44  */
45 int
xfs_setfilesize(struct xfs_inode * ip,xfs_off_t offset,size_t size)46 xfs_setfilesize(
47 	struct xfs_inode	*ip,
48 	xfs_off_t		offset,
49 	size_t			size)
50 {
51 	struct xfs_mount	*mp = ip->i_mount;
52 	struct xfs_trans	*tp;
53 	xfs_fsize_t		isize;
54 	int			error;
55 
56 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
57 	if (error)
58 		return error;
59 
60 	xfs_ilock(ip, XFS_ILOCK_EXCL);
61 	isize = xfs_new_eof(ip, offset + size);
62 	if (!isize) {
63 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
64 		xfs_trans_cancel(tp);
65 		return 0;
66 	}
67 
68 	trace_xfs_setfilesize(ip, offset, size);
69 
70 	ip->i_disk_size = isize;
71 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
72 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
73 
74 	return xfs_trans_commit(tp);
75 }
76 
77 /*
78  * IO write completion.
79  */
80 STATIC void
xfs_end_ioend(struct iomap_ioend * ioend)81 xfs_end_ioend(
82 	struct iomap_ioend	*ioend)
83 {
84 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
85 	xfs_off_t		offset = ioend->io_offset;
86 	size_t			size = ioend->io_size;
87 	unsigned int		nofs_flag;
88 	int			error;
89 
90 	/*
91 	 * We can allocate memory here while doing writeback on behalf of
92 	 * memory reclaim.  To avoid memory allocation deadlocks set the
93 	 * task-wide nofs context for the following operations.
94 	 */
95 	nofs_flag = memalloc_nofs_save();
96 
97 	/*
98 	 * Just clean up the in-memory structures if the fs has been shut down.
99 	 */
100 	if (xfs_is_shutdown(ip->i_mount)) {
101 		error = -EIO;
102 		goto done;
103 	}
104 
105 	/*
106 	 * Clean up any COW blocks on an I/O error.
107 	 */
108 	error = blk_status_to_errno(ioend->io_bio->bi_status);
109 	if (unlikely(error)) {
110 		if (ioend->io_flags & IOMAP_F_SHARED)
111 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
112 		goto done;
113 	}
114 
115 	/*
116 	 * Success: commit the COW or unwritten blocks if needed.
117 	 */
118 	if (ioend->io_flags & IOMAP_F_SHARED)
119 		error = xfs_reflink_end_cow(ip, offset, size);
120 	else if (ioend->io_type == IOMAP_UNWRITTEN)
121 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
122 
123 	if (!error && xfs_ioend_is_append(ioend))
124 		error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
125 done:
126 	iomap_finish_ioends(ioend, error);
127 	memalloc_nofs_restore(nofs_flag);
128 }
129 
130 /* Finish all pending io completions. */
131 void
xfs_end_io(struct work_struct * work)132 xfs_end_io(
133 	struct work_struct	*work)
134 {
135 	struct xfs_inode	*ip =
136 		container_of(work, struct xfs_inode, i_ioend_work);
137 	struct iomap_ioend	*ioend;
138 	struct list_head	tmp;
139 	unsigned long		flags;
140 
141 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
142 	list_replace_init(&ip->i_ioend_list, &tmp);
143 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
144 
145 	iomap_sort_ioends(&tmp);
146 	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
147 			io_list))) {
148 		list_del_init(&ioend->io_list);
149 		iomap_ioend_try_merge(ioend, &tmp);
150 		xfs_end_ioend(ioend);
151 	}
152 }
153 
154 STATIC void
xfs_end_bio(struct bio * bio)155 xfs_end_bio(
156 	struct bio		*bio)
157 {
158 	struct iomap_ioend	*ioend = bio->bi_private;
159 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
160 	unsigned long		flags;
161 
162 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
163 	if (list_empty(&ip->i_ioend_list))
164 		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
165 					 &ip->i_ioend_work));
166 	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
167 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
168 }
169 
170 /*
171  * Fast revalidation of the cached writeback mapping. Return true if the current
172  * mapping is valid, false otherwise.
173  */
174 static bool
xfs_imap_valid(struct iomap_writepage_ctx * wpc,struct xfs_inode * ip,loff_t offset)175 xfs_imap_valid(
176 	struct iomap_writepage_ctx	*wpc,
177 	struct xfs_inode		*ip,
178 	loff_t				offset)
179 {
180 	if (offset < wpc->iomap.offset ||
181 	    offset >= wpc->iomap.offset + wpc->iomap.length)
182 		return false;
183 	/*
184 	 * If this is a COW mapping, it is sufficient to check that the mapping
185 	 * covers the offset. Be careful to check this first because the caller
186 	 * can revalidate a COW mapping without updating the data seqno.
187 	 */
188 	if (wpc->iomap.flags & IOMAP_F_SHARED)
189 		return true;
190 
191 	/*
192 	 * This is not a COW mapping. Check the sequence number of the data fork
193 	 * because concurrent changes could have invalidated the extent. Check
194 	 * the COW fork because concurrent changes since the last time we
195 	 * checked (and found nothing at this offset) could have added
196 	 * overlapping blocks.
197 	 */
198 	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
199 		return false;
200 	if (xfs_inode_has_cow_data(ip) &&
201 	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
202 		return false;
203 	return true;
204 }
205 
206 /*
207  * Pass in a dellalloc extent and convert it to real extents, return the real
208  * extent that maps offset_fsb in wpc->iomap.
209  *
210  * The current page is held locked so nothing could have removed the block
211  * backing offset_fsb, although it could have moved from the COW to the data
212  * fork by another thread.
213  */
214 static int
xfs_convert_blocks(struct iomap_writepage_ctx * wpc,struct xfs_inode * ip,int whichfork,loff_t offset)215 xfs_convert_blocks(
216 	struct iomap_writepage_ctx *wpc,
217 	struct xfs_inode	*ip,
218 	int			whichfork,
219 	loff_t			offset)
220 {
221 	int			error;
222 	unsigned		*seq;
223 
224 	if (whichfork == XFS_COW_FORK)
225 		seq = &XFS_WPC(wpc)->cow_seq;
226 	else
227 		seq = &XFS_WPC(wpc)->data_seq;
228 
229 	/*
230 	 * Attempt to allocate whatever delalloc extent currently backs offset
231 	 * and put the result into wpc->iomap.  Allocate in a loop because it
232 	 * may take several attempts to allocate real blocks for a contiguous
233 	 * delalloc extent if free space is sufficiently fragmented.
234 	 */
235 	do {
236 		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
237 				&wpc->iomap, seq);
238 		if (error)
239 			return error;
240 	} while (wpc->iomap.offset + wpc->iomap.length <= offset);
241 
242 	return 0;
243 }
244 
245 static int
xfs_map_blocks(struct iomap_writepage_ctx * wpc,struct inode * inode,loff_t offset)246 xfs_map_blocks(
247 	struct iomap_writepage_ctx *wpc,
248 	struct inode		*inode,
249 	loff_t			offset)
250 {
251 	struct xfs_inode	*ip = XFS_I(inode);
252 	struct xfs_mount	*mp = ip->i_mount;
253 	ssize_t			count = i_blocksize(inode);
254 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
255 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
256 	xfs_fileoff_t		cow_fsb;
257 	int			whichfork;
258 	struct xfs_bmbt_irec	imap;
259 	struct xfs_iext_cursor	icur;
260 	int			retries = 0;
261 	int			error = 0;
262 
263 	if (xfs_is_shutdown(mp))
264 		return -EIO;
265 
266 	/*
267 	 * COW fork blocks can overlap data fork blocks even if the blocks
268 	 * aren't shared.  COW I/O always takes precedent, so we must always
269 	 * check for overlap on reflink inodes unless the mapping is already a
270 	 * COW one, or the COW fork hasn't changed from the last time we looked
271 	 * at it.
272 	 *
273 	 * It's safe to check the COW fork if_seq here without the ILOCK because
274 	 * we've indirectly protected against concurrent updates: writeback has
275 	 * the page locked, which prevents concurrent invalidations by reflink
276 	 * and directio and prevents concurrent buffered writes to the same
277 	 * page.  Changes to if_seq always happen under i_lock, which protects
278 	 * against concurrent updates and provides a memory barrier on the way
279 	 * out that ensures that we always see the current value.
280 	 */
281 	if (xfs_imap_valid(wpc, ip, offset))
282 		return 0;
283 
284 	/*
285 	 * If we don't have a valid map, now it's time to get a new one for this
286 	 * offset.  This will convert delayed allocations (including COW ones)
287 	 * into real extents.  If we return without a valid map, it means we
288 	 * landed in a hole and we skip the block.
289 	 */
290 retry:
291 	cow_fsb = NULLFILEOFF;
292 	whichfork = XFS_DATA_FORK;
293 	xfs_ilock(ip, XFS_ILOCK_SHARED);
294 	ASSERT(!xfs_need_iread_extents(&ip->i_df));
295 
296 	/*
297 	 * Check if this is offset is covered by a COW extents, and if yes use
298 	 * it directly instead of looking up anything in the data fork.
299 	 */
300 	if (xfs_inode_has_cow_data(ip) &&
301 	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
302 		cow_fsb = imap.br_startoff;
303 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
304 		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
305 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
306 
307 		whichfork = XFS_COW_FORK;
308 		goto allocate_blocks;
309 	}
310 
311 	/*
312 	 * No COW extent overlap. Revalidate now that we may have updated
313 	 * ->cow_seq. If the data mapping is still valid, we're done.
314 	 */
315 	if (xfs_imap_valid(wpc, ip, offset)) {
316 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
317 		return 0;
318 	}
319 
320 	/*
321 	 * If we don't have a valid map, now it's time to get a new one for this
322 	 * offset.  This will convert delayed allocations (including COW ones)
323 	 * into real extents.
324 	 */
325 	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
326 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
327 	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
328 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
329 
330 	/* landed in a hole or beyond EOF? */
331 	if (imap.br_startoff > offset_fsb) {
332 		imap.br_blockcount = imap.br_startoff - offset_fsb;
333 		imap.br_startoff = offset_fsb;
334 		imap.br_startblock = HOLESTARTBLOCK;
335 		imap.br_state = XFS_EXT_NORM;
336 	}
337 
338 	/*
339 	 * Truncate to the next COW extent if there is one.  This is the only
340 	 * opportunity to do this because we can skip COW fork lookups for the
341 	 * subsequent blocks in the mapping; however, the requirement to treat
342 	 * the COW range separately remains.
343 	 */
344 	if (cow_fsb != NULLFILEOFF &&
345 	    cow_fsb < imap.br_startoff + imap.br_blockcount)
346 		imap.br_blockcount = cow_fsb - imap.br_startoff;
347 
348 	/* got a delalloc extent? */
349 	if (imap.br_startblock != HOLESTARTBLOCK &&
350 	    isnullstartblock(imap.br_startblock))
351 		goto allocate_blocks;
352 
353 	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
354 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
355 	return 0;
356 allocate_blocks:
357 	error = xfs_convert_blocks(wpc, ip, whichfork, offset);
358 	if (error) {
359 		/*
360 		 * If we failed to find the extent in the COW fork we might have
361 		 * raced with a COW to data fork conversion or truncate.
362 		 * Restart the lookup to catch the extent in the data fork for
363 		 * the former case, but prevent additional retries to avoid
364 		 * looping forever for the latter case.
365 		 */
366 		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
367 			goto retry;
368 		ASSERT(error != -EAGAIN);
369 		return error;
370 	}
371 
372 	/*
373 	 * Due to merging the return real extent might be larger than the
374 	 * original delalloc one.  Trim the return extent to the next COW
375 	 * boundary again to force a re-lookup.
376 	 */
377 	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
378 		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
379 
380 		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
381 			wpc->iomap.length = cow_offset - wpc->iomap.offset;
382 	}
383 
384 	ASSERT(wpc->iomap.offset <= offset);
385 	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
386 	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
387 	return 0;
388 }
389 
390 static int
xfs_prepare_ioend(struct iomap_ioend * ioend,int status)391 xfs_prepare_ioend(
392 	struct iomap_ioend	*ioend,
393 	int			status)
394 {
395 	unsigned int		nofs_flag;
396 
397 	/*
398 	 * We can allocate memory here while doing writeback on behalf of
399 	 * memory reclaim.  To avoid memory allocation deadlocks set the
400 	 * task-wide nofs context for the following operations.
401 	 */
402 	nofs_flag = memalloc_nofs_save();
403 
404 	/* Convert CoW extents to regular */
405 	if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
406 		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
407 				ioend->io_offset, ioend->io_size);
408 	}
409 
410 	memalloc_nofs_restore(nofs_flag);
411 
412 	/* send ioends that might require a transaction to the completion wq */
413 	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
414 	    (ioend->io_flags & IOMAP_F_SHARED))
415 		ioend->io_bio->bi_end_io = xfs_end_bio;
416 	return status;
417 }
418 
419 /*
420  * If the page has delalloc blocks on it, we need to punch them out before we
421  * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
422  * inode that can trip up a later direct I/O read operation on the same region.
423  *
424  * We prevent this by truncating away the delalloc regions on the page.  Because
425  * they are delalloc, we can do this without needing a transaction. Indeed - if
426  * we get ENOSPC errors, we have to be able to do this truncation without a
427  * transaction as there is no space left for block reservation (typically why we
428  * see a ENOSPC in writeback).
429  */
430 static void
xfs_discard_page(struct page * page,loff_t fileoff)431 xfs_discard_page(
432 	struct page		*page,
433 	loff_t			fileoff)
434 {
435 	struct inode		*inode = page->mapping->host;
436 	struct xfs_inode	*ip = XFS_I(inode);
437 	struct xfs_mount	*mp = ip->i_mount;
438 	unsigned int		pageoff = offset_in_page(fileoff);
439 	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff);
440 	xfs_fileoff_t		pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
441 	int			error;
442 
443 	if (xfs_is_shutdown(mp))
444 		goto out_invalidate;
445 
446 	xfs_alert_ratelimited(mp,
447 		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
448 			page, ip->i_ino, fileoff);
449 
450 	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
451 			i_blocks_per_page(inode, page) - pageoff_fsb);
452 	if (error && !xfs_is_shutdown(mp))
453 		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
454 out_invalidate:
455 	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
456 }
457 
458 static const struct iomap_writeback_ops xfs_writeback_ops = {
459 	.map_blocks		= xfs_map_blocks,
460 	.prepare_ioend		= xfs_prepare_ioend,
461 	.discard_page		= xfs_discard_page,
462 };
463 
464 STATIC int
xfs_vm_writepages(struct address_space * mapping,struct writeback_control * wbc)465 xfs_vm_writepages(
466 	struct address_space	*mapping,
467 	struct writeback_control *wbc)
468 {
469 	struct xfs_writepage_ctx wpc = { };
470 
471 	/*
472 	 * Writing back data in a transaction context can result in recursive
473 	 * transactions. This is bad, so issue a warning and get out of here.
474 	 */
475 	if (WARN_ON_ONCE(current->journal_info))
476 		return 0;
477 
478 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
479 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
480 }
481 
482 STATIC int
xfs_dax_writepages(struct address_space * mapping,struct writeback_control * wbc)483 xfs_dax_writepages(
484 	struct address_space	*mapping,
485 	struct writeback_control *wbc)
486 {
487 	struct xfs_inode	*ip = XFS_I(mapping->host);
488 
489 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
490 	return dax_writeback_mapping_range(mapping,
491 			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
492 }
493 
494 STATIC sector_t
xfs_vm_bmap(struct address_space * mapping,sector_t block)495 xfs_vm_bmap(
496 	struct address_space	*mapping,
497 	sector_t		block)
498 {
499 	struct xfs_inode	*ip = XFS_I(mapping->host);
500 
501 	trace_xfs_vm_bmap(ip);
502 
503 	/*
504 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
505 	 * bypasses the file system for actual I/O.  We really can't allow
506 	 * that on reflinks inodes, so we have to skip out here.  And yes,
507 	 * 0 is the magic code for a bmap error.
508 	 *
509 	 * Since we don't pass back blockdev info, we can't return bmap
510 	 * information for rt files either.
511 	 */
512 	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
513 		return 0;
514 	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
515 }
516 
517 STATIC int
xfs_vm_readpage(struct file * unused,struct page * page)518 xfs_vm_readpage(
519 	struct file		*unused,
520 	struct page		*page)
521 {
522 	return iomap_readpage(page, &xfs_read_iomap_ops);
523 }
524 
525 STATIC void
xfs_vm_readahead(struct readahead_control * rac)526 xfs_vm_readahead(
527 	struct readahead_control	*rac)
528 {
529 	iomap_readahead(rac, &xfs_read_iomap_ops);
530 }
531 
532 static int
xfs_iomap_swapfile_activate(struct swap_info_struct * sis,struct file * swap_file,sector_t * span)533 xfs_iomap_swapfile_activate(
534 	struct swap_info_struct		*sis,
535 	struct file			*swap_file,
536 	sector_t			*span)
537 {
538 	sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
539 	return iomap_swapfile_activate(sis, swap_file, span,
540 			&xfs_read_iomap_ops);
541 }
542 
543 const struct address_space_operations xfs_address_space_operations = {
544 	.readpage		= xfs_vm_readpage,
545 	.readahead		= xfs_vm_readahead,
546 	.writepages		= xfs_vm_writepages,
547 	.set_page_dirty		= __set_page_dirty_nobuffers,
548 	.releasepage		= iomap_releasepage,
549 	.invalidatepage		= iomap_invalidatepage,
550 	.bmap			= xfs_vm_bmap,
551 	.direct_IO		= noop_direct_IO,
552 	.migratepage		= iomap_migrate_page,
553 	.is_partially_uptodate  = iomap_is_partially_uptodate,
554 	.error_remove_page	= generic_error_remove_page,
555 	.swap_activate		= xfs_iomap_swapfile_activate,
556 };
557 
558 const struct address_space_operations xfs_dax_aops = {
559 	.writepages		= xfs_dax_writepages,
560 	.direct_IO		= noop_direct_IO,
561 	.set_page_dirty		= __set_page_dirty_no_writeback,
562 	.invalidatepage		= noop_invalidatepage,
563 	.swap_activate		= xfs_iomap_swapfile_activate,
564 };
565