1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g extended
69 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
80 * that it guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * Fast Commit Replay Idempotence
107 * ------------------------------
108 *
109 * Fast commits tags are idempotent in nature provided the recovery code follows
110 * certain rules. The guiding principle that the commit path follows while
111 * committing is that it stores the result of a particular operation instead of
112 * storing the procedure.
113 *
114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115 * was associated with inode 10. During fast commit, instead of storing this
116 * operation as a procedure "rename a to b", we store the resulting file system
117 * state as a "series" of outcomes:
118 *
119 * - Link dirent b to inode 10
120 * - Unlink dirent a
121 * - Inode <10> with valid refcount
122 *
123 * Now when recovery code runs, it needs "enforce" this state on the file
124 * system. This is what guarantees idempotence of fast commit replay.
125 *
126 * Let's take an example of a procedure that is not idempotent and see how fast
127 * commits make it idempotent. Consider following sequence of operations:
128 *
129 * rm A; mv B A; read A
130 * (x) (y) (z)
131 *
132 * (x), (y) and (z) are the points at which we can crash. If we store this
133 * sequence of operations as is then the replay is not idempotent. Let's say
134 * while in replay, we crash at (z). During the second replay, file A (which was
135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
136 * file named A would be absent when we try to read A. So, this sequence of
137 * operations is not idempotent. However, as mentioned above, instead of storing
138 * the procedure fast commits store the outcome of each procedure. Thus the fast
139 * commit log for above procedure would be as follows:
140 *
141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142 * inode 11 before the replay)
143 *
144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
145 * (w) (x) (y) (z)
146 *
147 * If we crash at (z), we will have file A linked to inode 11. During the second
148 * replay, we will remove file A (inode 11). But we will create it back and make
149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152 * similarly. Thus, by converting a non-idempotent procedure into a series of
153 * idempotent outcomes, fast commits ensured idempotence during the replay.
154 *
155 * TODOs
156 * -----
157 *
158 * 0) Fast commit replay path hardening: Fast commit replay code should use
159 * journal handles to make sure all the updates it does during the replay
160 * path are atomic. With that if we crash during fast commit replay, after
161 * trying to do recovery again, we will find a file system where fast commit
162 * area is invalid (because new full commit would be found). In order to deal
163 * with that, fast commit replay code should ensure that the "FC_REPLAY"
164 * superblock state is persisted before starting the replay, so that after
165 * the crash, fast commit recovery code can look at that flag and perform
166 * fast commit recovery even if that area is invalidated by later full
167 * commits.
168 *
169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170 * eligible update must be protected within ext4_fc_start_update() and
 * ext4_fc_stop_update(). These routines are called from much higher-level
 * routines. This can be made more fine grained by combining with
173 * ext4_journal_start().
174 *
175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176 *
177 * 3) Handle more ineligible cases.
178 */
179
180 #include <trace/events/ext4.h>
181 static struct kmem_cache *ext4_fc_dentry_cachep;
182
ext4_end_buffer_io_sync(struct buffer_head * bh,int uptodate)183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184 {
185 BUFFER_TRACE(bh, "");
186 if (uptodate) {
187 ext4_debug("%s: Block %lld up-to-date",
188 __func__, bh->b_blocknr);
189 set_buffer_uptodate(bh);
190 } else {
191 ext4_debug("%s: Block %lld not up-to-date",
192 __func__, bh->b_blocknr);
193 clear_buffer_uptodate(bh);
194 }
195
196 unlock_buffer(bh);
197 }
198
/*
 * Reset the logical block range tracked for fast commit on this inode.
 * Called when the inode starts tracking against a new transaction
 * (see ext4_fc_track_template()) and at inode init time.
 */
static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}
206
/*
 * Initialize all fast-commit-related fields of an in-core inode:
 * clear the tracked range and COMMITTING state, and set up the
 * fast commit list linkage, wait queue and update counter.
 */
void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}
217
218 /* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * EXT4_STATE_FC_COMMITTING lives in a different word depending on
	 * the word size (i_state_flags on 32-bit, i_flags on 64-bit), so
	 * derive the bit waitqueue from the matching field.
	 */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/*
	 * Queue ourselves on the bit waitqueue *before* dropping s_fc_lock
	 * so that a wakeup issued when the COMMITTING bit is cleared cannot
	 * be missed. Returns with s_fc_lock released.
	 */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
242
243 /*
 * Inform Ext4's fast commit sub-system about the start of an inode update
245 *
246 * This function is called by the high level call VFS callbacks before
247 * performing any inode update. This function blocks if there's an ongoing
248 * fast commit on the inode in question.
249 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to coordinate if fast commits are off or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Inode not queued for fast commit: no commit can race with us. */
	if (list_empty(&ei->i_fc_list))
		goto out;

	/*
	 * If this inode is being committed right now, wait for that commit
	 * to finish. The wait helper drops s_fc_lock, so re-check from the
	 * top afterwards.
	 */
	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	/* Count this in-flight update; ext4_fc_stop_update() drops it. */
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
271
272 /*
273 * Stop inode update and wake up waiting fast commits if any.
274 */
ext4_fc_stop_update(struct inode * inode)275 void ext4_fc_stop_update(struct inode *inode)
276 {
277 struct ext4_inode_info *ei = EXT4_I(inode);
278
279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 return;
282
283 if (atomic_dec_and_test(&ei->i_fc_updates))
284 wake_up_all(&ei->i_fc_wait);
285 }
286
287 /*
288 * Remove inode from fast commit list. If the inode is being committed
289 * we wait until inode commit is done.
290 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Not on the fast commit list: nothing to remove. */
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	/*
	 * If the inode is being committed, wait for the commit to finish.
	 * The wait helper drops s_fc_lock, so retry from the top.
	 */
	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
313
314 /*
315 * Mark file system as fast commit ineligible. This means that next commit
316 * operation would result in a full jbd2 commit.
317 */
ext4_fc_mark_ineligible(struct super_block * sb,int reason)318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319 {
320 struct ext4_sb_info *sbi = EXT4_SB(sb);
321
322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 return;
325
326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329 }
330
331 /*
332 * Start a fast commit ineligible update. Any commits that happen while
333 * such an operation is in progress fall back to full commits.
334 */
ext4_fc_start_ineligible(struct super_block * sb,int reason)335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336 {
337 struct ext4_sb_info *sbi = EXT4_SB(sb);
338
339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 return;
342
343 WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 atomic_inc(&sbi->s_fc_ineligible_updates);
346 }
347
348 /*
349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350 * to ensure that after stopping the ineligible update, at least one full
351 * commit takes place.
352 */
ext4_fc_stop_ineligible(struct super_block * sb)353 void ext4_fc_stop_ineligible(struct super_block *sb)
354 {
355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 return;
358
359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361 }
362
ext4_fc_is_ineligible(struct super_block * sb)363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
364 {
365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367 }
368
369 /*
 * Generic fast commit tracking function. If this is the first time we are
371 * called after a full commit, we initialize fast commit fields and then call
372 * __fc_track_fn() with update = 0. If we have already been called after a full
373 * commit, we pass update = 1. Based on that, the track function can determine
374 * if it needs to track a field for the first time or if it needs to just
375 * update the previously tracked value.
376 *
377 * If enqueue is set, this function enqueues the inode in fast commit list.
378 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		/* Same transaction as the last tracked change: update mode. */
		update = true;
	} else {
		/* First tracked change under this transaction: start fresh. */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	/* The track function runs under i_fc_lock (it may drop/retake it). */
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	/*
	 * Queue the inode for the next fast commit: on the staging queue
	 * while a fast commit is in progress, otherwise on the main queue.
	 */
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
421
/* Arguments handed to __track_dentry_update() via ext4_fc_track_template() */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created / linked / unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
426
427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/*
	 * Drop i_fc_lock around the (potentially sleeping) allocations
	 * below; it is re-taken before returning, as the caller expects.
	 */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* Can't track this update: force a full commit instead. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	/*
	 * Short names fit in the node's inline buffer; longer names need a
	 * separate allocation.
	 */
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	/* Queue on the staging list while a fast commit is in progress. */
	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
477
/* Track the removal of @dentry (which referred to @inode) for fast commit. */
void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	int ret;
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_UNLINK,
	};

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}
491
/* Convenience wrapper: track an unlink of @dentry's own inode. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	__ext4_fc_track_unlink(handle, inode, dentry);
}
496
/* Track the creation of a new link @dentry to @inode for fast commit. */
void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	int ret;
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_LINK,
	};

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}
510
/* Convenience wrapper: track a link using @dentry's own inode. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	__ext4_fc_track_link(handle, inode, dentry);
}
515
/* Track the creation of @inode together with its dentry for fast commit. */
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	int ret;
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_CREAT,
	};

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}
529
/* Convenience wrapper: track creation of @dentry's own inode. */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	__ext4_fc_track_create(handle, inode, dentry);
}
534
535 /* __track_fn for inode tracking */
/* __track_fn for inode tracking; runs with ei->i_fc_lock held. */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Already tracked under this transaction: nothing further to do. */
	if (update)
		return -EEXIST;

	ei->i_fc_lblk_len = 0;
	return 0;
}
545
/* Track an inode metadata change and enqueue the inode for fast commit. */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	/* Directory changes are tracked through dentry updates, not here. */
	if (S_ISDIR(inode->i_mode))
		return;

	/* data=journal inodes are not supported: force a full commit. */
	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}
562
/* Arguments for __track_range(): inclusive logical block range [start, end] */
struct __track_range_args {
	ext4_lblk_t start, end;
};
566
567 /* __track_fn for tracking data updates */
/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	/* Ranges on special (reserved) inodes cannot be fast committed. */
	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		/*
		 * Grow the tracked range to the union of the old range
		 * [oldstart, oldstart + len - 1] and the new [start, end].
		 */
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		/* First range for this transaction: track [start, end] as-is. */
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
594
/* Track a changed logical block range [start, end] of @inode. */
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	int ret;
	struct __track_range_args args = {
		.start = start,
		.end = end,
	};

	/* Directory changes are tracked through dentry updates, not ranges. */
	if (S_ISDIR(inode->i_mode))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}
611
/* Submit the current fast commit buffer; the tail block goes down with
 * REQ_FUA | REQ_PREFLUSH (when barriers are enabled) to make the commit
 * durable. Consumes sbi->s_fc_bh.
 */
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
	int write_flags = REQ_SYNC;

	if (is_tail && test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;

	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}
627
628 /* Ext4 commit path routines */
629
630 /* memzero and update CRC */
/* Zero @len bytes at @dst and fold the zeroed bytes into *@crc (if set). */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
			     u32 *crc)
{
	memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return dst;
}
641
642 /*
643 * Allocate len bytes on a fast commit buffer.
644 *
645 * During the commit time this function is used to manage fast commit
646 * block space. We don't split a fast commit log onto different
647 * blocks. So this function makes sure that if there's not enough space
648 * on the current block, the remaining space in the current block is
649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
650 * new block is from jbd2 and CRC is updated to reflect the padding
651 * we added.
652 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/*
	 * Not enough room: mark the rest of the current block as unused with
	 * a PAD tag, submit it, and restart on a fresh block.
	 * NOTE(review): this path dereferences sbi->s_fc_bh with no NULL
	 * check — it appears to rely on the fast path above having run at
	 * least once for the current block; confirm that invariant holds.
	 */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	/* Zero (and checksum) the padded bytes, if any. */
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Account for the skipped tail of the old block plus this request. */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
701
702 /* memcpy to fc reserved space and update CRC */
/* Copy @len bytes from @src to fast commit space, folding them into *@crc
 * (when @crc is non-NULL) before the copy, exactly as the original did.
 */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
			    int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	memcpy(dst, src, len);
	return dst;
}
710
711 /*
712 * Complete a fast commit by writing tail tag.
713 *
714 * Writing tail tag marks the end of a fast commit. In order to guarantee
715 * atomicity, after writing tail tag, even if there's space remaining
716 * in the block, next commit shouldn't use it. That's why tail tag
717 * has the length as that of the remaining space on the block.
718 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	/*
	 * The tail's length covers the whole remainder of the block so the
	 * next commit cannot reuse it (see the function comment above).
	 */
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/*
	 * The CRC itself is written last and excluded from the checksum
	 * (NULL crc pointer), so it covers everything up to this point.
	 */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}
753
754 /*
755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
756 * Returns false if there's not enough space.
757 */
ext4_fc_add_tlv(struct super_block * sb,u16 tag,u16 len,u8 * val,u32 * crc)758 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
759 u32 *crc)
760 {
761 struct ext4_fc_tl tl;
762 u8 *dst;
763
764 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
765 if (!dst)
766 return false;
767
768 tl.fc_tag = cpu_to_le16(tag);
769 tl.fc_len = cpu_to_le16(len);
770
771 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
772 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
773
774 return true;
775 }
776
777 /* Same as above, but adds dentry tlv. */
ext4_fc_add_dentry_tlv(struct super_block * sb,u32 * crc,struct ext4_fc_dentry_update * fc_dentry)778 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
779 struct ext4_fc_dentry_update *fc_dentry)
780 {
781 struct ext4_fc_dentry_info fcd;
782 struct ext4_fc_tl tl;
783 int dlen = fc_dentry->fcd_name.len;
784 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
785 crc);
786
787 if (!dst)
788 return false;
789
790 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
791 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
792 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
793 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
794 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
795 dst += sizeof(tl);
796 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
797 dst += sizeof(fcd);
798 ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
799 dst += dlen;
800
801 return true;
802 }
803
804 /*
805 * Writes inode in the fast commit space under TLV with tag @tag.
806 * Returns 0 on success, error on failure.
807 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Larger on-disk inodes carry i_extra_isize bytes past the old size. */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	/*
	 * Layout: [tl][fc_ino][raw on-disk inode].
	 * NOTE(review): reservation uses sizeof(fc_inode.fc_ino) but the copy
	 * below uses sizeof(fc_inode); presumably equal because fc_inode ends
	 * in a flexible array — confirm against struct ext4_fc_inode.
	 */
	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
846
847 /*
848 * Writes updated data ranges for the inode in question. Updates CRC.
849 * Returns 0 on success, error otherwise.
850 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot the tracked range and reset it, all under i_fc_lock. */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	/* Walk the range, emitting one tag per mapped or unmapped run. */
	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		/* Nothing known about this block; advance by one and retry. */
		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* A hole: record it as a deleted range. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			/* Mapped blocks: record them as an added extent. */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
920
921
922 /* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* From this point on, new updates are diverted to the staging queue. */
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/*
		 * Wait for in-flight updates on this inode to drain. The
		 * spinlock is dropped while sleeping, so the counter is
		 * re-checked after prepare_to_wait() to avoid a lost wakeup.
		 */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* jbd2_submit_inode_data() may block: drop the lock first. */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
956
957 /* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		/* Only inodes whose data was submitted need to be waited on. */
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* The wait can block; drop the spinlock around it. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
981
982 /* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	/* Called with s_fc_lock held; returns with it held on every path. */
	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/*
			 * Link/unlink need only the dentry TLV. The lock is
			 * dropped around the (possibly blocking) TLV write.
			 */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* For a create, find the in-core inode this dentry names. */
		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Re-take the lock so the caller's unlock stays balanced. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
1051
/*
 * Perform one fast commit: flush data, then write the TLV stream
 * (optional head, dentry updates, per-inode ranges and inodes, tail).
 * Returns 0 on success, negative error otherwise.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	/* Submit and wait for data of all inodes queued for this commit. */
	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	/* Dentry updates go first; the helper returns with the lock held. */
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	/* Then range and inode TLVs for every inode marked COMMITTING. */
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		/* TLV writes can block: drop the spinlock around them. */
		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	/* Terminate this fast commit with a tail tag carrying the CRC. */
	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}
1123
1124 /*
1125 * The main commit entry point. Performs a fast commit for transaction
1126 * commit_tid if needed. If it's not possible to perform a fast commit
1127 * due to various reasons, we fall back to full commit. Returns 0
1128 * on success, error otherwise.
1129 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	/* Fast commits disabled or something made this TID ineligible. */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Record prior FC area usage so nblks below is just this commit's. */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	/* Wait for the fast commit blocks just written to reach the disk. */
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	/* Update commit statistics under s_fc_lock. */
	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* On FC failure or ineligibility, fall back to a full commit. */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
1218
1219 /*
1220 * Fast commit cleanup routine. This is called after every fast commit and
1221 * full commit. full is true if we are called after a full commit.
1222 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	/* After a full commit, the current fast commit buffer is stale. */
	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	/* Reset every queued inode and wake waiters on COMMITTING. */
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	/* Free all queued dentry updates; drop the lock around the frees. */
	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		/* Only names longer than the inline buffer were kmalloc'ed. */
		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	/* Promote updates staged while this commit ran to the main queues. */
	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	/* A full commit makes the whole fast commit area reusable. */
	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}
1278
1279 /* Ext4 Replay Path Routines */
1280
1281 /* Helper struct for dentry replay routines */
struct dentry_info_args {
	/* Parent dir inode number, name length, inode number, inode length. */
	int parent_ino, dname_len, ino, inode_len;
	/* Name bytes inside the TLV value buffer; length is dname_len. */
	char *dname;
};
1286
/* Decode a dentry TLV value into a dentry_info_args helper struct. */
static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info dinfo;

	/* Copy the fixed header out of the value buffer before decoding. */
	memcpy(&dinfo, val, sizeof(dinfo));

	darg->parent_ino = le32_to_cpu(dinfo.fc_parent_ino);
	darg->ino = le32_to_cpu(dinfo.fc_ino);
	/* The name trails the fixed header; its length comes from the TLV. */
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
}
1300
1301 /* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct dentry_info_args darg;
	struct inode *victim, *dir;
	struct qstr entry;
	int ret;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;

	/* If the target inode cannot be read, there is nothing to unlink. */
	victim = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(victim)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	/* Same for the parent directory. */
	dir = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(victim);
		return 0;
	}

	ret = __ext4_unlink(NULL, dir, &entry, victim);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;

	iput(dir);
	iput(victim);
	return ret;
}
1340
/*
 * Link @inode under the name described by @darg. Fabricates dentries so
 * the regular namei helpers can be reused during replay.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	/* Get a dentry for the parent so __ext4_link() can be used. */
	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		/*
		 * NOTE(review): the cleanup below does iput(dir) on this
		 * path, which assumes d_obtain_alias() did NOT consume the
		 * inode reference on failure — verify against VFS semantics.
		 */
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/* dput() on dentry_dir drops the dir reference it holds. */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1398
1399 /* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct dentry_info_args darg;
	struct inode *inode;
	int ret;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* A missing inode simply means there is nothing to re-link. */
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}
1421
1422 /*
1423 * Record all the modified inodes during replay. We use this later to setup
1424 * block bitmaps correctly.
1425 */
ext4_fc_record_modified_inode(struct super_block * sb,int ino)1426 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1427 {
1428 struct ext4_fc_replay_state *state;
1429 int i;
1430
1431 state = &EXT4_SB(sb)->s_fc_replay_state;
1432 for (i = 0; i < state->fc_modified_inodes_used; i++)
1433 if (state->fc_modified_inodes[i] == ino)
1434 return 0;
1435 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1436 state->fc_modified_inodes_size +=
1437 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1438 state->fc_modified_inodes = krealloc(
1439 state->fc_modified_inodes, sizeof(int) *
1440 state->fc_modified_inodes_size,
1441 GFP_KERNEL);
1442 if (!state->fc_modified_inodes)
1443 return -ENOMEM;
1444 }
1445 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1446 return 0;
1447 }
1448
1449 /*
1450 * Inode replay function
1451 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	/* Drop any stale in-core state before rewriting the on-disk inode. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	/* NOTE(review): return value ignored; an -ENOMEM here is dropped. */
	ext4_fc_record_modified_inode(sb, ino);

	/* The raw on-disk inode image trails the fixed fc_inode header. */
	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	/* Copy everything around i_block; i_block needs special handling. */
	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		/* Install an empty extent tree root if none is present. */
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		/* Inline data lives directly in i_block; copy it verbatim. */
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	/* Recompute the checksum after all the edits above. */
	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	/*
	 * NOTE(review): errors reaching here are swallowed (always returns
	 * 0) and only skip the flush — confirm this is intentional.
	 */
	return 0;
}
1543
1544 /*
1545 * Dentry create replay function.
1546 *
1547 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1548 * inode for which we are trying to create a dentry here, should already have
1549 * been replayed before we start here.
1550 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of update group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			/*
			 * Log the parent inode number: it is the parent
			 * directory lookup that failed here, not the lookup
			 * of the newly created inode.
			 */
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			/* Best effort: continue replay despite the failure. */
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
1604
1605 /*
1606 * Record physical disk regions which are in use as per fast commit area. Our
1607 * simple replay phase allocator excludes these regions from allocation.
1608 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *fc_regions;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		/*
		 * Grow the array via a temporary pointer: assigning the
		 * krealloc() result straight back would leak the old buffer
		 * on failure, and fc_regions_size must only grow once the
		 * allocation has actually succeeded.
		 */
		fc_regions = krealloc(state->fc_regions,
			sizeof(struct ext4_fc_alloc_region) *
			(state->fc_regions_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
			GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions = fc_regions;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}
1635
1636 /* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	/* NOTE(review): return value ignored; an -ENOMEM here is dropped. */
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/*
	 * Bring the inode's mapping for [start, start+len) in sync with the
	 * logged extent, one contiguous chunk at a time.
	 */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			/* Insert the missing piece of the logged extent. */
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
			map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
				ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Trim the extent tree back to the inode's current size. */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1763
1764 /* Replay DEL_RANGE tag */
1765 static int
ext4_fc_replay_del_range(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1766 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1767 u8 *val)
1768 {
1769 struct inode *inode;
1770 struct ext4_fc_del_range lrange;
1771 struct ext4_map_blocks map;
1772 ext4_lblk_t cur, remaining;
1773 int ret;
1774
1775 memcpy(&lrange, val, sizeof(lrange));
1776 cur = le32_to_cpu(lrange.fc_lblk);
1777 remaining = le32_to_cpu(lrange.fc_len);
1778
1779 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1780 le32_to_cpu(lrange.fc_ino), cur, remaining);
1781
1782 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1783 if (IS_ERR(inode)) {
1784 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1785 return 0;
1786 }
1787
1788 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1789
1790 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1791 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1792 le32_to_cpu(lrange.fc_len));
1793 while (remaining > 0) {
1794 map.m_lblk = cur;
1795 map.m_len = remaining;
1796
1797 ret = ext4_map_blocks(NULL, inode, &map, 0);
1798 if (ret < 0) {
1799 iput(inode);
1800 return 0;
1801 }
1802 if (ret > 0) {
1803 remaining -= ret;
1804 cur += ret;
1805 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1806 } else {
1807 remaining -= map.m_len;
1808 cur += map.m_len;
1809 }
1810 }
1811
1812 ret = ext4_punch_hole(inode,
1813 le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1814 le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
1815 if (ret)
1816 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1817 ext4_ext_replay_shrink_inode(inode,
1818 i_size_read(inode) >> sb->s_blocksize_bits);
1819 ext4_mark_inode_dirty(NULL, inode);
1820 iput(inode);
1821
1822 return 0;
1823 }
1824
/*
 * After all tags are replayed, walk every inode recorded as modified and
 * mark its data blocks and extent tree index blocks as in-use in the
 * block bitmaps.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the inode's whole logical block space. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* Also mark the tree's index blocks in use. */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				/* Mark the mapped data blocks as allocated. */
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
					map.m_len, 1);
			} else {
				/* Hole: skip it (at least one block). */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1872
1873 /*
1874 * Check if block is in excluded regions for block allocation. The simple
1875 * allocator that runs during replay phase is calls this function to see
1876 * if it is okay to use a block.
1877 */
ext4_fc_replay_check_excluded(struct super_block * sb,ext4_fsblk_t blk)1878 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1879 {
1880 int i;
1881 struct ext4_fc_replay_state *state;
1882
1883 state = &EXT4_SB(sb)->s_fc_replay_state;
1884 for (i = 0; i < state->fc_regions_valid; i++) {
1885 if (state->fc_regions[i].ino == 0 ||
1886 state->fc_regions[i].len == 0)
1887 continue;
1888 if (blk >= state->fc_regions[i].pblk &&
1889 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1890 return true;
1891 }
1892 return false;
1893 }
1894
1895 /* Cleanup function called after replay */
ext4_fc_replay_cleanup(struct super_block * sb)1896 void ext4_fc_replay_cleanup(struct super_block *sb)
1897 {
1898 struct ext4_sb_info *sbi = EXT4_SB(sb);
1899
1900 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1901 kfree(sbi->s_fc_replay_state.fc_regions);
1902 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1903 }
1904
1905 /*
1906 * Recovery Scan phase handler
1907 *
1908 * This function is called during the scan phase and is responsible
1909 * for doing following things:
1910 * - Make sure the fast commit area has valid tags for replay
1911 * - Count number of tags that need to be replayed by the replay handler
1912 * - Verify CRC
1913 * - Create a list of excluded blocks for allocation during replay phase
1914 *
1915 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1916 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1917 * to indicate that scan has finished and JBD2 can now start replay phase.
1918 * It returns a negative error to indicate that there was an error. At the end
1919 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1920 * to indicate the number of tags that need to replayed during the replay phase.
1921 */
static int ext4_fc_replay_scan(journal_t *journal,
			       struct buffer_head *bh, int off,
			       tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	/* Pointer to the last valid byte of this fast commit block. */
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		/* First block of the fast commit area: reset scan state. */
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Fast commit blocks must arrive strictly in order. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	/*
	 * Walk the TLV records in this block. NOTE(review): tl.fc_len comes
	 * straight from on-disk data and is not checked against "end" before
	 * "val" is dereferenced — a corrupted journal could make these
	 * memcpy()s read past the block. Confirm whether the journal layer
	 * validates this, or whether a bounds check is needed here.
	 */
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			/*
			 * Record the extent's physical range so the replay
			 * allocator won't hand these blocks out; then fall
			 * through to the common count-and-checksum path.
			 */
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold the whole TLV into the CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			/*
			 * The running CRC covers the tail only up to (not
			 * including) its own fc_crc field, since the CRC
			 * cannot cover itself.
			 */
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				/* Valid tail: commit everything seen so far. */
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				/*
				 * tid/CRC mismatch: stop if an earlier commit
				 * already validated, otherwise report bad CRC.
				 */
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			/* Each commit's CRC starts fresh after its tail. */
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				/* Unknown feature bits: cannot replay. */
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				/* Stale fast commit area: stop scanning. */
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			/* Unknown tag: stop, or cancel if nothing valid yet. */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
2033
2034 /*
2035 * Main recovery path entry point.
2036 * The meaning of return codes is similar as above.
2037 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
			  enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	/* The scan pass is handled entirely by the scan helper. */
	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First call of a new (non-scan) pass: enter replay mode. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		/* Scan validated nothing; finalize bitmaps and bail out. */
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	/* Debug knob: artificially cap how many blocks get replayed. */
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	/* Pointer to the last valid byte of this fast commit block. */
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/*
	 * Walk the TLVs in this block, dispatching each tag to its replay
	 * handler. NOTE(review): as in the scan phase, tl.fc_len is read
	 * from disk without a bounds check against "end" — verify upstream.
	 */
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			/* Replayed everything the scan validated. */
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			/* Padding carries no state; just trace it. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			/* Scan already verified the tid; mismatch here is a bug. */
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			/* Scan should have rejected unknown tags already. */
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		/* Handler succeeded; ask JBD2 for more blocks. */
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2129
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * The replay callback is registered unconditionally: even when fast
	 * commits are disabled now, the journal may still contain fast commit
	 * blocks written earlier that have to be replayed.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (test_opt2(sb, JOURNAL_FAST_COMMIT))
		journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
2142
/*
 * Human-readable labels for fast commit ineligibility reasons, printed by
 * ext4_fc_info_show() for indices 0..EXT4_FC_REASON_MAX-1. The order here
 * presumably mirrors the EXT4_FC_REASON_* enum values used to index
 * fc_ineligible_reason_count — keep the two in sync (TODO: confirm against
 * the enum definition in the header).
 */
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2155
ext4_fc_info_show(struct seq_file * seq,void * v)2156 int ext4_fc_info_show(struct seq_file *seq, void *v)
2157 {
2158 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2159 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2160 int i;
2161
2162 if (v != SEQ_START_TOKEN)
2163 return 0;
2164
2165 seq_printf(seq,
2166 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2167 stats->fc_num_commits, stats->fc_ineligible_commits,
2168 stats->fc_numblks,
2169 div_u64(sbi->s_fc_avg_commit_time, 1000));
2170 seq_puts(seq, "Ineligible reasons:\n");
2171 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2172 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2173 stats->fc_ineligible_reason_count[i]);
2174
2175 return 0;
2176 }
2177
ext4_fc_init_dentry_cache(void)2178 int __init ext4_fc_init_dentry_cache(void)
2179 {
2180 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2181 SLAB_RECLAIM_ACCOUNT);
2182
2183 if (ext4_fc_dentry_cachep == NULL)
2184 return -ENOMEM;
2185
2186 return 0;
2187 }
2188