// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs (see
 * struct ext4_fc_tl, sketched below). Each TLV contains some delta that is
 * replayed TLV by TLV during the recovery phase. For the scenarios for which
 * we currently don't have replay code, fast commit falls back to full commits.
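 *
 * Each TLV begins with a fixed header (a sketch of struct ext4_fc_tl, as
 * defined in fast_commit.h) followed by fc_len bytes of tag-specific value:
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// EXT4_FC_TAG_*
 *		__le16 fc_len;	// length of the value that follows
 *	};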
 *
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc.):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed; it is instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations, in the
 * order in which they are issued, in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; please read the following
 *     section for more details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is already present, fast commit waits for
 * it to complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
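 *
 * For example, an inode update path is expected to look roughly like this
 * (sketch):
 *
 *	ext4_fc_start_update(inode);
 *	// ... modify the inode ...
 *	ext4_fc_stop_update(inode);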
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains a CRC of the contents and the TID of the transaction after
 * which this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * that during replay we crash at (z). On the second replay, file A (which was
 * actually created as a result of the "mv B A" operation) would get deleted.
 * Thus, a file named A would be absent when we try to read A. So, this
 * sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. Otherwise, if we crash during fast commit replay and
 *    then retry recovery, we will find a file system where the fast commit
 *    area is invalid (because a new full commit would be found). In order to
 *    deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during a fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

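	/*
	 * Note: on 64-bit systems the dynamic inode state flags are assumed
	 * to live in the high bits of i_flags (see ext4.h) rather than in a
	 * separate i_state_flags word, hence the two wait-bit variants.
	 */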
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high-level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed,
 * we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);

	return;
}
332 
333 /*
334  * Mark file system as fast commit ineligible, and record latest
335  * ineligible transaction tid. This means until the recorded
336  * transaction, commit operation would result in a full jbd2 commit.
337  */
ext4_fc_mark_ineligible(struct super_block * sb,int reason,handle_t * handle)338 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
339 {
340 	struct ext4_sb_info *sbi = EXT4_SB(sb);
341 	tid_t tid;
342 
343 	if (ext4_fc_disabled(sb))
344 		return;
345 
346 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
347 	if (handle && !IS_ERR(handle))
348 		tid = handle->h_transaction->t_tid;
349 	else {
350 		read_lock(&sbi->s_journal->j_state_lock);
351 		tid = sbi->s_journal->j_running_transaction ?
352 				sbi->s_journal->j_running_transaction->t_tid : 0;
353 		read_unlock(&sbi->s_journal->j_state_lock);
354 	}
355 	spin_lock(&sbi->s_fc_lock);
356 	if (sbi->s_fc_ineligible_tid < tid)
357 		sbi->s_fc_ineligible_tid = tid;
358 	spin_unlock(&sbi->s_fc_lock);
359 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
360 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
361 }

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM, NULL);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;
	INIT_LIST_HEAD(&node->fcd_dilist);
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

	/*
	 * This helps us keep track of all fc_dentry updates that are part of
	 * this ext4 inode. So if the inode gets unlinked before we even get a
	 * chance to fsync, we can remove all fc_dentry references while
	 * evicting the inode in ext4_fc_del().
	 * Also with this, we don't need to loop over all the inodes in
	 * sbi->s_fc_q to get the corresponding inode in
	 * ext4_fc_commit_dentry_updates().
	 */
	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
	}
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

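	/*
	 * Merge with the range tracked so far. For example (sketch), if
	 * blocks [10, 14] are already tracked (start 10, len 5) and the new
	 * range is [8, 20], the result is start 8, len 13.
	 */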
	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	blk_opf_t write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only for the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is obtained from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + EXT4_FC_TAG_BASE_LEN > bsize)
		return NULL;

	if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
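
/*
 * For example, when a TLV doesn't fit in the current block, the block is
 * finished off as follows (a sketch of the layout produced above):
 *
 *	[used bytes][fc_tag = EXT4_FC_TAG_PAD, fc_len = pad_len][pad_len zeroes]
 *
 * with pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN, after which the
 * requested TLV is placed at the start of a fresh jbd2 fast commit buffer.
 */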

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate the tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
	dst += EXT4_FC_TAG_BASE_LEN;
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if the TLV was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
	ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);

	return true;
}

/* Same as above, but adds a dentry TLV. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb,
			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
	dst += EXT4_FC_TAG_BASE_LEN;
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

	return true;
}

/*
 * Writes the inode in the fast commit space under the EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
		goto err;
	dst += EXT4_FC_TAG_BASE_LEN;
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		goto err;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		goto err;
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	ext4_debug("will try writing %d to %d for inode %ld\n",
		   cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
		 * corresponding inode pointer
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
			status, commit_tid);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * Weight the average commit time higher than this commit's time so we
	 * don't react too strongly to vast changes in the commit time.
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (iter->i_sync_tid <= tid)
			ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	if (tid >= sbi->s_fc_ineligible_tid) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

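/*
 * The value of a dentry TLV is laid out as struct ext4_fc_dentry_info
 * (sketch): two fixed fields followed by the name,
 *
 *	[fc_parent_ino (le32)][fc_ino (le32)][fc_dname (rest of fc_len)]
 *
 * so the name length is fc_len minus the fixed part, as computed below.
 */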
static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
}

static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
{
	memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
	tl->fc_len = le16_to_cpu(tl->fc_len);
	tl->fc_tag = le16_to_cpu(tl->fc_tag);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		ext4_debug("Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		ext4_debug("Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		ext4_debug("Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		ext4_debug("Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = tl->fc_tag;
	struct ext4_extent_header *eh;
	size_t off_gen = offsetof(struct ext4_inode, i_generation);

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
	       inode_len - off_gen);
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks the inode occupies.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}
1609 
1610 /*
1611  * Dentry create replay function.
1612  *
1613  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1614  * inode for which we are trying to create a dentry here, should already have
1615  * been replayed before we start here.
1616  */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			ext4_debug("Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}

/*
 * Record physical disk regions that the fast commit area reports as in use
 * by inodes during the replay phase. Our simple replay-phase allocator
 * excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not match
	 * fc_regions_used; bring it up to date when recording new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}
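
/*
 * The regions recorded above are consulted by ext4_fc_replay_check_excluded()
 * below; blocks that fall inside a recorded region are skipped by the simple
 * replay-phase allocator.
 */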

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

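	/*
	 * Walk the logged range chunk by chunk. Each pass below handles one
	 * of three cases: the chunk is not mapped at all, it is mapped to
	 * different physical blocks than the log records, or it is mapped
	 * correctly but its unwritten state needs to change.
	 */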
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_free_ext_path(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		ext4_debug("Converting from %ld to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
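	/*
	 * First release whatever the range currently maps to in the block
	 * bitmaps, then punch the logical range out of the extent tree
	 * below.
	 */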
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

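/*
 * Final fixup after all tags are replayed: walk every inode recorded as
 * modified and mark the blocks it references, including its extent tree
 * index blocks, as in use in the block bitmaps.
 */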
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_free_ext_path(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (in_range(blk, state->fc_regions[i].pblk,
					state->fc_regions[i].len))
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
					   u8 *val, u8 *end)
{
	if (val + tl->fc_len > end)
		return false;

	/*
	 * Only check ADD_RANGE/TAIL/HEAD here, since their payloads are read
	 * during the journal rescan that runs before the CRC check. Length
	 * checks for the other tags rely on the CRC check.
	 */
	switch (tl->fc_tag) {
	case EXT4_FC_TAG_ADD_RANGE:
		return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
	case EXT4_FC_TAG_TAIL:
		return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
	case EXT4_FC_TAG_HEAD:
		return (sizeof(struct ext4_fc_head) == tl->fc_len);
	case EXT4_FC_TAG_DEL_RANGE:
	case EXT4_FC_TAG_LINK:
	case EXT4_FC_TAG_UNLINK:
	case EXT4_FC_TAG_CREAT:
	case EXT4_FC_TAG_INODE:
	case EXT4_FC_TAG_PAD:
	default:
		return true;
	}
}

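/*
 * On disk, a fast commit block is a dense sequence of TLVs, roughly:
 *
 *	+--------+--------+-----------------------+--------+--------+--
 *	| fc_tag | fc_len | fc_len bytes of value | fc_tag | fc_len | ..
 *	+--------+--------+-----------------------+--------+--------+--
 *
 * EXT4_FC_TAG_BASE_LEN covers the fc_tag and fc_len fields, and fc_len
 * counts only the value bytes that follow, which is why both the scan and
 * replay loops advance the cursor by EXT4_FC_TAG_BASE_LEN + tl.fc_len.
 */
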
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible for
 * doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;
		if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
			goto out_err;
		}
		ext4_debug("Scan phase, tag:%s, blk %lld\n",
			   tag2str(tl.fc_tag), bh->b_blocknr);
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						EXT4_FC_TAG_BASE_LEN +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		ext4_debug("Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}

		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
		state->fc_replay_num_tags--;
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     tl.fc_len, 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
					     0, tl.fc_len, 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we may still have fast commit blocks that need to be replayed even
	 * if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

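/*
 * Human-readable names for the fast commit ineligibility reasons; the
 * order here must line up with the EXT4_FC_REASON_* values that index
 * fc_ineligible_reason_count in ext4_fc_info_show() below.
 */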
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}