// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kthread.h>
#include <linux/pagemap.h>

#include "ctree.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "transaction.h"

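/*
 * The caching kernel thread. It walks the commit root of the fs tree,
 * looks at the gaps between the objectids of consecutive inode items,
 * and feeds those gaps to the free-ino cache as ranges of free inode
 * numbers. Waiters on root->ino_cache_wait are woken up as numbers
 * become available.
 */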
static int caching_kthread(void *data)
{
	struct btrfs_root *root = data;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	u64 last = (u64)-1;
	int slot;
	int ret;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Since the commit root is read-only, we can safely skip locking. */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_INODE_ITEM_KEY;
again:
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->commit_root_sem);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		if (btrfs_fs_closing(fs_info))
			goto out;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			if (need_resched() ||
			    btrfs_transaction_in_commit(fs_info)) {
				leaf = path->nodes[0];

				if (WARN_ON(btrfs_header_nritems(leaf) == 0))
					break;

				/*
				 * Save the key so we can advance forward
				 * in the next search.
				 */
				btrfs_item_key_to_cpu(leaf, &key, 0);
				btrfs_release_path(path);
				root->ino_cache_progress = last;
				up_read(&fs_info->commit_root_sem);
				schedule_timeout(1);
				goto again;
			} else
				continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.type != BTRFS_INODE_ITEM_KEY)
			goto next;

		if (key.objectid >= root->highest_objectid)
			break;

		if (last != (u64)-1 && last + 1 != key.objectid) {
			__btrfs_add_free_space(fs_info, ctl, last + 1,
					       key.objectid - last - 1);
			wake_up(&root->ino_cache_wait);
		}

		last = key.objectid;
next:
		path->slots[0]++;
	}

	if (last < root->highest_objectid - 1) {
		__btrfs_add_free_space(fs_info, ctl, last + 1,
				       root->highest_objectid - last - 1);
	}

	spin_lock(&root->ino_cache_lock);
	root->ino_cache_state = BTRFS_CACHE_FINISHED;
	spin_unlock(&root->ino_cache_lock);

	root->ino_cache_progress = (u64)-1;
	btrfs_unpin_free_ino(root);
out:
	wake_up(&root->ino_cache_wait);
	up_read(&fs_info->commit_root_sem);

	btrfs_free_path(path);

	return ret;
}

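/*
 * Start filling the free-ino cache for @root: first try to load a
 * previously saved cache from disk. If there is none, grab the next
 * free objectid, mark [objectid, BTRFS_LAST_FREE_OBJECTID] free so
 * allocations can proceed right away, and kick off caching_kthread()
 * to find the holes below that.
 */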
static void start_caching(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct task_struct *tsk;
	int ret;
	u64 objectid;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return;

	spin_lock(&root->ino_cache_lock);
	if (root->ino_cache_state != BTRFS_CACHE_NO) {
		spin_unlock(&root->ino_cache_lock);
		return;
	}

	root->ino_cache_state = BTRFS_CACHE_STARTED;
	spin_unlock(&root->ino_cache_lock);

	ret = load_free_ino_cache(fs_info, root);
	if (ret == 1) {
		spin_lock(&root->ino_cache_lock);
		root->ino_cache_state = BTRFS_CACHE_FINISHED;
		spin_unlock(&root->ino_cache_lock);
		return;
	}

	/*
	 * It can be quite time-consuming to fill the cache by searching
	 * through the fs tree, and this can keep the ino allocation path
	 * waiting. Therefore at the start we quickly find the highest
	 * inode number, and we know we can use the inode numbers which
	 * fall in [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
	 */
	ret = btrfs_find_free_objectid(root, &objectid);
	if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
		__btrfs_add_free_space(fs_info, ctl, objectid,
				       BTRFS_LAST_FREE_OBJECTID - objectid + 1);
	}

	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
			  root->root_key.objectid);
	if (IS_ERR(tsk)) {
		btrfs_warn(fs_info, "failed to start inode caching task");
		btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
					     "disabling inode map caching");
	}
}

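/*
 * Allocate a free inode number for @root. Without the inode map cache
 * this simply hands out highest_objectid + 1; with the cache we hand
 * out a number from the in-memory free_ino tree, starting (and waiting
 * on) the caching thread if the tree is empty.
 */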
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
	if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
		return btrfs_find_free_objectid(root, objectid);

again:
	*objectid = btrfs_find_ino_for_alloc(root);

	if (*objectid != 0)
		return 0;

	start_caching(root);

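	/*
	 * Wait until the caching thread either finds some free inode
	 * numbers or finishes scanning the whole tree.
	 */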
	wait_event(root->ino_cache_wait,
		   root->ino_cache_state == BTRFS_CACHE_FINISHED ||
		   root->free_ino_ctl->free_space > 0);

	if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
	    root->free_ino_ctl->free_space == 0)
		return -ENOSPC;
	else
		goto again;
}

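/*
 * Give an inode number back to the cache, e.g. when creating the inode
 * failed. The number is staged in the pinned tree and only moved to
 * the free_ino tree at transaction commit, see btrfs_unpin_free_ino().
 */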
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return;
again:
	if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
		__btrfs_add_free_space(fs_info, pinned, objectid, 1);
	} else {
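		/*
		 * Take commit_root_sem to keep the caching thread from
		 * running while we change state, then recheck under the
		 * lock in case caching finished in the meantime.
		 */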
		down_write(&fs_info->commit_root_sem);
		spin_lock(&root->ino_cache_lock);
		if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
			spin_unlock(&root->ino_cache_lock);
			up_write(&fs_info->commit_root_sem);
			goto again;
		}
		spin_unlock(&root->ino_cache_lock);

		start_caching(root);

		__btrfs_add_free_space(fs_info, pinned, objectid, 1);

		up_write(&fs_info->commit_root_sem);
	}
}

/*
 * When a transaction is committed, we'll move the inode numbers which
 * are smaller than root->ino_cache_progress from the pinned tree to the
 * free_ino tree, and the others will just be dropped, because the commit
 * root we were searching has changed.
 *
 * Must be called with root->fs_info->commit_root_sem held.
 */
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
	spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
	struct btrfs_free_space *info;
	struct rb_node *n;
	u64 count;

	if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
		return;

	while (1) {
		spin_lock(rbroot_lock);
		n = rb_first(rbroot);
		if (!n) {
			spin_unlock(rbroot_lock);
			break;
		}

		info = rb_entry(n, struct btrfs_free_space, offset_index);
		BUG_ON(info->bitmap); /* Logic error */

		if (info->offset > root->ino_cache_progress)
			count = 0;
		else
			count = min(root->ino_cache_progress - info->offset + 1,
				    info->bytes);

		rb_erase(&info->offset_index, rbroot);
		spin_unlock(rbroot_lock);
		if (count)
			__btrfs_add_free_space(root->fs_info, ctl,
					       info->offset, count);
		kmem_cache_free(btrfs_free_space_cachep, info);
	}
}

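/*
 * INIT_THRESHOLD caps the number of extent entries at what 16K of
 * memory can hold; INODES_PER_BITMAP is how many inode numbers one
 * page-sized bitmap covers.
 */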
#define INIT_THRESHOLD	((SZ_32K / 2) / sizeof(struct btrfs_free_space))
#define INODES_PER_BITMAP (PAGE_SIZE * 8)

/*
 * The goal is to keep the memory used by the free_ino tree from
 * exceeding what we would use if we tracked the free inode numbers
 * with bitmaps only.
 */
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int max_ino;
	int max_bitmaps;

	n = rb_last(&ctl->free_space_offset);
	if (!n) {
		ctl->extents_thresh = INIT_THRESHOLD;
		return;
	}
	info = rb_entry(n, struct btrfs_free_space, offset_index);

	/*
	 * Find the maximum inode number in the filesystem. Note we
	 * ignore the fact that this can be a bitmap, because we are
	 * not doing a precise calculation.
	 */
	max_ino = info->bytes - 1;

	max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
	if (max_bitmaps <= ctl->total_bitmaps) {
		ctl->extents_thresh = 0;
		return;
	}

	ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
				PAGE_SIZE / sizeof(*info);
}

/*
 * We don't fall back to a bitmap if we are below the extents threshold
 * or if this chunk of inode numbers is a big one.
 */
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		       struct btrfs_free_space *info)
{
	if (ctl->free_extents < ctl->extents_thresh ||
	    info->bytes > INODES_PER_BITMAP / 10)
		return false;

	return true;
}

static const struct btrfs_free_space_op free_ino_op = {
	.recalc_thresholds	= recalculate_thresholds,
	.use_bitmap		= use_bitmap,
};

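/* The pinned tree never uses bitmaps, so there is nothing to adjust. */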
static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
{
}

static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	/*
	 * We always use extents for two reasons:
	 *
	 * - The pinned tree is only used while the caching work is in
	 *   progress.
	 * - It makes the code simpler. See btrfs_unpin_free_ino().
	 */
	return false;
}

static const struct btrfs_free_space_op pinned_free_ino_op = {
	.recalc_thresholds	= pinned_recalc_thresholds,
	.use_bitmap		= pinned_use_bitmap,
};

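/*
 * Set up the two free-space ctls of a root: free_ino_ctl holds the
 * cached free inode numbers and free_ino_pinned holds the numbers
 * returned during the current transaction, see btrfs_return_ino().
 */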
void btrfs_init_free_ino_ctl(struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;

	spin_lock_init(&ctl->tree_lock);
	ctl->unit = 1;
	ctl->start = 0;
	ctl->private = NULL;
	ctl->op = &free_ino_op;
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);

	/*
	 * Initially we allow 16K of RAM to be used for caching chunks of
	 * inode numbers before we resort to bitmaps. This is somewhat
	 * arbitrary, but it will be adjusted at runtime.
	 */
	ctl->extents_thresh = INIT_THRESHOLD;

	spin_lock_init(&pinned->tree_lock);
	pinned->unit = 1;
	pinned->start = 0;
	pinned->private = NULL;
	pinned->extents_thresh = 0;
	pinned->op = &pinned_free_ino_op;
}

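/*
 * Write the in-memory free-ino cache into the cache inode on disk, so
 * that it can be loaded by load_free_ino_cache() after the next mount.
 * Called at transaction commit time.
 */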
int btrfs_save_ino_cache(struct btrfs_root *root,
			 struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_path *path;
	struct inode *inode;
	struct btrfs_block_rsv *rsv;
	struct extent_changeset *data_reserved = NULL;
	u64 num_bytes;
	u64 alloc_hint = 0;
	int ret;
	int prealloc;
	bool retry = false;

	/* Only the fs tree and subvol/snap roots need the ino cache */
	if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
	    (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
	     root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
		return 0;

	/* Don't save inode cache if we are deleting this root */
	if (btrfs_root_refs(&root->root_item) == 0)
		return 0;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	rsv = trans->block_rsv;
	trans->block_rsv = &fs_info->trans_block_rsv;

	num_bytes = trans->bytes_reserved;
	/*
	 * 1 item for inode item insertion if needed
	 * 4 items for inode item update (in the worst case)
	 * 1 item for slack space if we need to do truncation
	 * 1 item for the free space object
	 * 3 items for pre-allocation
	 */
	trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10);
	ret = btrfs_block_rsv_add(root, trans->block_rsv,
				  trans->bytes_reserved,
				  BTRFS_RESERVE_NO_FLUSH);
	if (ret)
		goto out;
	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
				      trans->bytes_reserved, 1);
again:
	inode = lookup_free_ino_inode(root, path);
	if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
		ret = PTR_ERR(inode);
		goto out_release;
	}

	if (IS_ERR(inode)) {
		BUG_ON(retry); /* Logic error */
		retry = true;

		ret = create_free_ino_inode(root, trans, path);
		if (ret)
			goto out_release;
		goto again;
	}

	BTRFS_I(inode)->generation = 0;
	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_put;
	}

	if (i_size_read(inode) > 0) {
		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
		if (ret) {
			if (ret != -ENOSPC)
				btrfs_abort_transaction(trans, ret);
			goto out_put;
		}
	}

	spin_lock(&root->ino_cache_lock);
	if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
		ret = -1;
		spin_unlock(&root->ino_cache_lock);
		goto out_put;
	}
	spin_unlock(&root->ino_cache_lock);

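	/*
	 * Estimate the on-disk size of the cache: one struct
	 * btrfs_free_space per extent entry plus one page per bitmap.
	 */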
	spin_lock(&ctl->tree_lock);
	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
	prealloc = ALIGN(prealloc, PAGE_SIZE);
	prealloc += ctl->total_bitmaps * PAGE_SIZE;
	spin_unlock(&ctl->tree_lock);

	/* Just to make sure we have enough space */
	prealloc += 8 * PAGE_SIZE;

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
					      prealloc, prealloc, &alloc_hint);
	if (ret) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
		goto out_put;
	}

	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
	btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
out_put:
	iput(inode);
out_release:
	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
				      trans->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				trans->bytes_reserved);
out:
	trans->block_rsv = rsv;
	trans->bytes_reserved = num_bytes;

	btrfs_free_path(path);
	extent_changeset_free(data_reserved);
	return ret;
}

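/*
 * Find the highest objectid currently in use in @root by positioning
 * at BTRFS_LAST_FREE_OBJECTID and looking at the item just before it.
 * Returns BTRFS_FIRST_FREE_OBJECTID - 1 if the tree is empty.
 */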
int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *l;
	struct btrfs_key search_key;
	struct btrfs_key found_key;
	int slot;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
	search_key.type = -1;
	search_key.offset = (u64)-1;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto error;
	BUG_ON(ret == 0); /* Corruption */
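	/*
	 * The search key can never exist, so btrfs_search_slot() puts us
	 * right after the last item with an objectid not above
	 * BTRFS_LAST_FREE_OBJECTID; step back one slot to read it.
	 */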
	if (path->slots[0] > 0) {
		slot = path->slots[0] - 1;
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);
		*objectid = max_t(u64, found_key.objectid,
				  BTRFS_FIRST_FREE_OBJECTID - 1);
	} else {
		*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

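/*
 * Hand out the next unused objectid for @root, or -ENOSPC once
 * highest_objectid has hit BTRFS_LAST_FREE_OBJECTID.
 */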
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
{
	int ret;

	mutex_lock(&root->objectid_mutex);

	if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
		btrfs_warn(root->fs_info,
			   "the objectid of root %llu reaches its highest value",
			   root->root_key.objectid);
		ret = -ENOSPC;
		goto out;
	}

	*objectid = ++root->highest_objectid;
	ret = 0;
out:
	mutex_unlock(&root->objectid_mutex);
	return ret;
}