// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *	[HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *	       |<---  Fast Commit 1  --->|<---     Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher
 *    level routines. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */
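
/*
 * Illustrative sketch (not part of the commit or replay logic above): given
 * a fast commit block `blk` of `blocksize` bytes laid out as described, a
 * reader could walk its TLV stream roughly like this. replay_one_tag() is a
 * hypothetical dispatch helper, named here only for illustration:
 *
 *	struct ext4_fc_tl *tl;
 *	u8 *cur = blk, *end = blk + blocksize;
 *
 *	while (cur + sizeof(*tl) <= end) {
 *		tl = (struct ext4_fc_tl *)cur;
 *		switch (le16_to_cpu(tl->fc_tag)) {
 *		case EXT4_FC_TAG_PAD:
 *			break;			// rest of the block is unused
 *		case EXT4_FC_TAG_TAIL:
 *			goto done;		// end of this fast commit
 *		default:
 *			replay_one_tag(tl);	// hypothetical helper
 *		}
 *		cur += sizeof(*tl) + le16_to_cpu(tl->fc_len);
 *	}
 */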

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commits about the start of an inode update
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}
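
/*
 * Usage sketch (hypothetical caller, for illustration only): a fast commit
 * eligible update is bracketed with the pair above, e.g.:
 *
 *	ext4_fc_start_update(inode);
 *	...	// modify the inode and tell fast commit what changed, e.g.
 *	...	// ext4_fc_track_range(handle, inode, start, end);
 *	ext4_fc_stop_update(inode);
 *
 * If a fast commit is in the middle of committing this inode,
 * ext4_fc_start_update() blocks until EXT4_STATE_FC_COMMITTING is cleared.
 */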

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that next commit
 * operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}
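
/*
 * Usage sketch (hypothetical caller, for illustration only): an operation
 * that fast commits cannot replay yet is bracketed like this, which forces
 * every commit issued in between, plus the next one, to be a full commit:
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	...	// e.g. modify extended attributes
 *	ext4_fc_stop_ineligible(sb);
 */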

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
			(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
			&sbi->s_fc_q[FC_Q_STAGING] :
			&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	struct inode *inode = d_inode(dentry);
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
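
/*
 * Worked example of the merge above (illustration only): with a tracked
 * range of lblk 10, len 5 (blocks 10..14), a new update for blocks 8..12
 * yields start = min(10, 8) = 8 and len = max(14, 12) - 8 + 1 = 7, i.e.
 * the single covering range 8..14.
 */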

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
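
/*
 * Worked example of the padding math above (illustration only), assuming a
 * 4096 byte journal block and sizeof(struct ext4_fc_tl) == 4 (two __le16
 * fields): for a request arriving at off == 4080, bsize - off - 1 == 15
 * cannot hold len + 4 bytes once len >= 11, so a PAD tag is written with
 * pad_len = 4096 - 4080 - 1 - 4 = 11, the block is submitted, and the
 * reservation continues at the start of a fresh block.
 */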

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
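
/*
 * Resulting on-disk layout of the tail record (a sketch): the TLV header is
 * followed by the tid and then the crc, and fc_len claims everything up to
 * the end of the block so that no later commit can reuse the leftover space:
 *
 *	[fc_tag = TAIL][fc_len = rest of block][fc_tid][fc_crc]
 *
 * Note that the CRC covers the log contents up to and including fc_tid but
 * not the crc field itself, which is why the last memcpy above passes a
 * NULL crc pointer.
 */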

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}
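
/*
 * On-disk layout produced above (a sketch): the value part of a dentry TLV
 * is the fixed ext4_fc_dentry_info header followed by the unterminated name:
 *
 *	[fc_tag][fc_len = sizeof(fcd) + dlen][fc_parent_ino][fc_ino][dname]
 *
 * tl_to_darg() in the replay path below recovers dname_len as
 * fc_len - sizeof(struct ext4_fc_dentry_info).
 */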

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				       fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
					sb, fc_dentry->fcd_op,
					fc_dentry->fcd_parent, fc_dentry->fcd_ino,
					fc_dentry->fcd_name.len,
					fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc))
			goto out;
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
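
/*
 * Worked example of the averaging above (illustration only): the average is
 * an exponentially weighted moving average with a weight of 3/4 on history.
 * With s_fc_avg_commit_time = 100 and a new commit_time of 200 (in the same
 * time unit), the update yields (200 + 100 * 3) / 4 = 125, so a single slow
 * commit only moves the average a quarter of the way toward the new sample.
 */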

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	/* Move staged entries to the main queues for the next commit. */
	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Get length of a particular tlv */
static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
{
	return le16_to_cpu(tl->fc_len);
}

/* Get a pointer to "value" of a tlv */
static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
{
	return (u8 *)tl + sizeof(*tl);
}

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl)
{
	struct ext4_fc_dentry_info *fcd;

	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);

	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
	darg->ino = le32_to_cpu(fcd->fc_ino);
	darg->dname = fcd->fc_dname;
	darg->dname_len = ext4_fc_tag_len(tl) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
				state->fc_modified_inodes, sizeof(int) *
				state->fc_modified_inodes_size,
				GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct ext4_fc_inode *fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);

	ino = le32_to_cpu(fc_inode->fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR_OR_NULL(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks the inode occupies.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area.
 * Our simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
				state->fc_regions,
				state->fc_regions_size *
				sizeof(struct ext4_fc_alloc_region),
				GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1712
1713 /* Replay DEL_RANGE tag */
1714 static int
ext4_fc_replay_del_range(struct super_block * sb,struct ext4_fc_tl * tl)1715 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct ext4_fc_del_range *lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
	cur = le32_to_cpu(lrange->fc_lblk);
	remaining = le32_to_cpu(lrange->fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
			le32_to_cpu(lrange->fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
		return 0;
	}

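	/*
	 * Failure to record the modified inode is not treated as fatal
	 * here; replay proceeds best-effort and ret is overwritten below.
	 */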
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
			le32_to_cpu(lrange->fc_len));
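	/*
	 * Mark the physical blocks backing this logical range as free in
	 * the replay block bitmaps; the hole punch below removes them
	 * from the extent tree itself.
	 */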
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

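	/*
	 * ext4_punch_hole() takes byte offsets; widen the 32-bit block
	 * numbers before shifting so large ranges don't overflow.
	 */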
	ret = ext4_punch_hole(inode,
		(loff_t)le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		(loff_t)le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
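		/*
		 * Walk every extent of this inode and mark both the
		 * extent tree blocks and the data blocks they map as
		 * in-use in the block bitmaps.
		 */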
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to check whether it is okay to use a block.
 */
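/*
 * For instance, a replay-time allocator might skip excluded blocks with
 * a loop like the following (illustrative sketch only):
 *
 *	while (ext4_fc_replay_check_excluded(sb, blk))
 *		blk++;
 */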
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
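		/* Skip unused or empty region entries. */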
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
			blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (__u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
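	/*
	 * Walk the TLVs in this block, folding each tag into the running
	 * CRC; a tail tag whose tid and CRC match marks everything seen
	 * so far in this transaction as valid for replay.
	 */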
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is similar to that of
 * ext4_fc_replay_scan() above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

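/* s_fc_debug_max_replay is a debug-only cap on how many blocks are replayed. */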
#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (__u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

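	/*
	 * Dispatch each tag in this block to its replay handler, stopping
	 * once all the tags counted during the scan phase are consumed.
	 */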
	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled,
	 * because we could still have fast commit blocks that need to
	 * be replayed even though fast commit has since been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

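/*
 * Human-readable names for the fast commit ineligibility reasons;
 * ext4_fc_info_show() below indexes this array by reason code, so the
 * order must match the EXT4_FC_REASON_* values.
 */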
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}
