// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space
 * compaction, and the balance routine to spread chunks over devices.
 *
 * 		Before		|		After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents	| BG A: deleted
 *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after the swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
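
/*
 * A condensed sketch of the flow above (illustrative only; the real driver
 * is btrfs_relocate_block_group(), which loops until no more extents are
 * found in the target block group):
 *
 *	btrfs_inc_block_group_ro(bg);		// step 1: no new allocations
 *	do {					// steps 2.1 - 2.3
 *		relocate_block_group(rc);
 *	} while (rc->extents_found);
 *	clean_dirty_subvols(rc);		// step 3: drop the reloc trees
 */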

#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};
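
/*
 * A minimal usage sketch for the mapping above (assumes the rb_simple_*()
 * helpers from misc.h, which key the rbtree by the embedded bytenr):
 *
 *	struct mapping_node *mn = kmalloc(sizeof(*mn), GFP_NOFS);
 *
 *	mn->bytenr = root->commit_root->start;
 *	mn->data = root;
 *	spin_lock(&tree->lock);
 *	rb_simple_insert(&tree->rb_root, mn->bytenr, &mn->rb_node);
 *	spin_unlock(&tree->lock);
 *
 * __add_reloc_root() and find_reloc_root() below are the real insert and
 * lookup paths.
 */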

/*
 * Represents a tree block to process.
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};
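
/*
 * Example cluster layout (a sketch, not taken from a real run): for three
 * relocated extents copied into the contiguous range [start, end],
 * boundary[] records the logical start of each extent so writeback can
 * split ordered extents at those offsets:
 *
 *	cluster->start       = extent0_start;
 *	cluster->end         = extent2_end;
 *	cluster->boundary[0] = extent0_start;
 *	cluster->boundary[1] = extent1_start;
 *	cluster->boundary[2] = extent2_start;
 *	cluster->nr          = 3;
 */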

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1
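
/*
 * The stage gates what each pass is allowed to touch. For example,
 * replace_file_extents() below starts with:
 *
 *	if (rc->stage != UPDATE_DATA_PTRS)
 *		return 0;
 *
 * so file extent items are only rewritten during the second stage.
 */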

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bit(&rc->processed_blocks, node->bytenr,
			       node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
	}
	node->processed = 1;
}


static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

/*
 * Walk up backref nodes until we reach the node that represents the tree root.
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * Walk down backref nodes to find the start of the next reference path.
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
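
/*
 * Typical traversal idiom pairing the two helpers above (a sketch of the
 * pattern used by callers such as select_one_root()): walk_up_backref()
 * climbs one reference path to a tree root and records the edges taken,
 * then walk_down_backref() backtracks to the start of the next path:
 *
 *	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
 *	struct btrfs_backref_node *next = node;
 *	int index = 0;
 *
 *	while (next) {
 *		next = walk_up_backref(next, edges, &index);
 *		// examine next->root here
 *		next = walk_down_backref(edges, &index);
 *	}
 */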

static void update_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;
	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * Detached nodes are used to avoid unnecessary backref
	 * lookups. A transaction commit changes the extent tree,
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * Some nodes can be left in the pending list if there were
	 * errors during processing of the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static bool reloc_root_is_dead(struct btrfs_root *root)
{
	/*
	 * Pairs with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after the swap is considered dead, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return 1;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}
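
/*
 * find_reloc_root() returns the root with a reference held (via
 * btrfs_grab_root()), so the expected caller pattern is, in sketch form:
 *
 *	root = find_reloc_root(fs_info, bytenr);
 *	if (root) {
 *		// use the reloc root
 *		btrfs_put_root(root);
 *	}
 */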

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeping nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				 list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block. Root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when the tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}
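
/*
 * Caller-side sketch for build_backref_tree() (modeled on its use in
 * relocate_tree_block()): an ERR_PTR() is a hard error, while a NULL
 * return means the block was found useless and already cleaned up:
 *
 *	node = build_backref_tree(trans, rc, &block->key,
 *				  block->level, block->bytenr);
 *	if (IS_ERR(node))
 *		return PTR_ERR(node);
 *	if (node) {
 *		// relocate the block
 *	}
 */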

/*
 * Helper to add a backref node for the newly created snapshot.
 * The backref node is created by cloning the backref node that
 * corresponds to the root of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list.  There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}
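
/*
 * Lifecycle of the mapping managed by the three helpers above, in brief:
 * __add_reloc_root() inserts the entry keyed by the reloc root's commit
 * root when the tree is created, __update_reloc_root() re-keys it once
 * the root node changes, and __del_reloc_root() removes it when the
 * reloc tree dies. A condensed sketch:
 *
 *	ret = __add_reloc_root(reloc_root);	// key: commit_root->start
 *	...
 *	__update_reloc_root(reloc_root);	// re-key to node->start
 *	...
 *	__del_reloc_root(reloc_root);		// drop mapping (and ref)
 */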

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook.
		 * The source tree is a reloc tree, all tree blocks
		 * modified after it was created have the RELOC flag
		 * set in their headers, so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * create reloc tree for a given fs tree. reloc tree is just a
 * snapshot of the fs tree with special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important.  We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root.  This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees.  Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root.  We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * helper to find the first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
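
/*
 * Worked example for get_new_location() (a sketch; index_cnt is set to the
 * block group's start when the data reloc inode is created): for a block
 * group starting at bytenr 1G, an extent originally at 1G + 4M is looked up
 * at file offset 4M in the reloc inode, and the file extent item found
 * there yields the new disk bytenr of the relocated copy.
 */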

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(BTRFS_I(inode));
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end,
						      &cached_state);
				if (!ret)
					continue;

				btrfs_drop_extent_map_range(BTRFS_I(inode),
							    key.offset, end, true);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end, &cached_state);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(BTRFS_I(inode));
	return ret;
}

static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * Try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. Tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * If a block was replaced, level of the block + 1 is returned.
 * If no block got replaced, 0 is returned. If there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
				    true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}
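
/*
 * Reference bookkeeping in the swap above, summarized: once the block
 * pointers are exchanged, the fs tree (dest) adds a ref to the reloc tree
 * block and drops its ref on the old block, while the reloc tree (src)
 * adds a ref to the old block through its node and drops the one it held
 * on the new block. The four ref updates map to those transitions:
 *
 *	add  old_bytenr, parent = reloc node	(reloc tree keeps old block)
 *	add  new_bytenr, keyed to dest root	(fs tree owns new block)
 *	free new_bytenr, parent = reloc node	(reloc tree drops new block)
 *	free old_bytenr, keyed to dest root	(fs tree drops old block)
 */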

/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}
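
/*
 * The two walkers above are driven as a pair by merge_reloc_root() below;
 * the core of that loop, in sketch form:
 *
 *	while (1) {
 *		ret = walk_down_reloc_tree(reloc_root, path, &level);
 *		if (ret > 0)
 *			break;		// nothing newer than last_snapshot
 *		replace_path(trans, rc, root, reloc_root, path,
 *			     &next_key, level, max_level);
 *		ret = walk_up_reloc_tree(reloc_root, path, &level);
 *		if (ret > 0)
 *			break;		// whole tree visited
 *	}
 */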
1515  
1516  /*
1517   * invalidate extent cache for file extents whose key in range of
1518   * [min_key, max_key)
1519   */
invalidate_extent_cache(struct btrfs_root * root,struct btrfs_key * min_key,struct btrfs_key * max_key)1520  static int invalidate_extent_cache(struct btrfs_root *root,
1521  				   struct btrfs_key *min_key,
1522  				   struct btrfs_key *max_key)
1523  {
1524  	struct btrfs_fs_info *fs_info = root->fs_info;
1525  	struct inode *inode = NULL;
1526  	u64 objectid;
1527  	u64 start, end;
1528  	u64 ino;
1529  
1530  	objectid = min_key->objectid;
1531  	while (1) {
1532  		struct extent_state *cached_state = NULL;
1533  
1534  		cond_resched();
1535  		iput(inode);
1536  
1537  		if (objectid > max_key->objectid)
1538  			break;
1539  
1540  		inode = find_next_inode(root, objectid);
1541  		if (!inode)
1542  			break;
1543  		ino = btrfs_ino(BTRFS_I(inode));
1544  
1545  		if (ino > max_key->objectid) {
1546  			iput(inode);
1547  			break;
1548  		}
1549  
1550  		objectid = ino + 1;
1551  		if (!S_ISREG(inode->i_mode))
1552  			continue;
1553  
1554  		if (unlikely(min_key->objectid == ino)) {
1555  			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1556  				continue;
1557  			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1558  				start = 0;
1559  			else {
1560  				start = min_key->offset;
1561  				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
1562  			}
1563  		} else {
1564  			start = 0;
1565  		}
1566  
1567  		if (unlikely(max_key->objectid == ino)) {
1568  			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1569  				continue;
1570  			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1571  				end = (u64)-1;
1572  			} else {
1573  				if (max_key->offset == 0)
1574  					continue;
1575  				end = max_key->offset;
1576  				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1577  				end--;
1578  			}
1579  		} else {
1580  			end = (u64)-1;
1581  		}
1582  
1583  		/* the lock_extent waits for read_folio to complete */
1584  		lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
1585  		btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
1586  		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
1587  	}
1588  	return 0;
1589  }
1590  
find_next_key(struct btrfs_path * path,int level,struct btrfs_key * key)1591  static int find_next_key(struct btrfs_path *path, int level,
1592  			 struct btrfs_key *key)
1593  
1594  {
1595  	while (level < BTRFS_MAX_LEVEL) {
1596  		if (!path->nodes[level])
1597  			break;
1598  		if (path->slots[level] + 1 <
1599  		    btrfs_header_nritems(path->nodes[level])) {
1600  			btrfs_node_key_to_cpu(path->nodes[level], key,
1601  					      path->slots[level] + 1);
1602  			return 0;
1603  		}
1604  		level++;
1605  	}
1606  	return 1;
1607  }
1608  
1609  /*
1610   * Insert current subvolume into reloc_control::dirty_subvol_roots
1611   */
insert_dirty_subvol(struct btrfs_trans_handle * trans,struct reloc_control * rc,struct btrfs_root * root)1612  static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
1613  			       struct reloc_control *rc,
1614  			       struct btrfs_root *root)
1615  {
1616  	struct btrfs_root *reloc_root = root->reloc_root;
1617  	struct btrfs_root_item *reloc_root_item;
1618  	int ret;
1619  
1620  	/* @root must be a subvolume tree root with a valid reloc tree */
1621  	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1622  	ASSERT(reloc_root);
1623  
1624  	reloc_root_item = &reloc_root->root_item;
1625  	memset(&reloc_root_item->drop_progress, 0,
1626  		sizeof(reloc_root_item->drop_progress));
1627  	btrfs_set_root_drop_level(reloc_root_item, 0);
1628  	btrfs_set_root_refs(reloc_root_item, 0);
1629  	ret = btrfs_update_reloc_root(trans, root);
1630  	if (ret)
1631  		return ret;
1632  
1633  	if (list_empty(&root->reloc_dirty_list)) {
1634  		btrfs_grab_root(root);
1635  		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
1636  	}
1637  
1638  	return 0;
1639  }
1640  
clean_dirty_subvols(struct reloc_control * rc)1641  static int clean_dirty_subvols(struct reloc_control *rc)
1642  {
1643  	struct btrfs_root *root;
1644  	struct btrfs_root *next;
1645  	int ret = 0;
1646  	int ret2;
1647  
1648  	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
1649  				 reloc_dirty_list) {
1650  		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1651  			/* Merged subvolume, cleanup its reloc root */
1652  			struct btrfs_root *reloc_root = root->reloc_root;
1653  
1654  			list_del_init(&root->reloc_dirty_list);
1655  			root->reloc_root = NULL;
1656  			/*
1657  			 * Need barrier to ensure clear_bit() only happens after
1658  			 * root->reloc_root = NULL. Pairs with have_reloc_root.
1659  			 */
1660  			smp_wmb();
1661  			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1662  			if (reloc_root) {
1663  				/*
1664  				 * btrfs_drop_snapshot drops our ref we hold for
1665  				 * ->reloc_root.  If it fails however we must
1666  				 * drop the ref ourselves.
1667  				 */
1668  				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
1669  				if (ret2 < 0) {
1670  					btrfs_put_root(reloc_root);
1671  					if (!ret)
1672  						ret = ret2;
1673  				}
1674  			}
1675  			btrfs_put_root(root);
1676  		} else {
1677  			/* Orphan reloc tree, just clean it up */
1678  			ret2 = btrfs_drop_snapshot(root, 0, 1);
1679  			if (ret2 < 0) {
1680  				btrfs_put_root(root);
1681  				if (!ret)
1682  					ret = ret2;
1683  			}
1684  		}
1685  	}
1686  	return ret;
1687  }
1688  
1689  /*
1690   * merge the relocated tree blocks in reloc tree with corresponding
1691   * fs tree.
1692   */
merge_reloc_root(struct reloc_control * rc,struct btrfs_root * root)1693  static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1694  					       struct btrfs_root *root)
1695  {
1696  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1697  	struct btrfs_key key;
1698  	struct btrfs_key next_key;
1699  	struct btrfs_trans_handle *trans = NULL;
1700  	struct btrfs_root *reloc_root;
1701  	struct btrfs_root_item *root_item;
1702  	struct btrfs_path *path;
1703  	struct extent_buffer *leaf;
1704  	int reserve_level;
1705  	int level;
1706  	int max_level;
1707  	int replaced = 0;
1708  	int ret = 0;
1709  	u32 min_reserved;
1710  
1711  	path = btrfs_alloc_path();
1712  	if (!path)
1713  		return -ENOMEM;
1714  	path->reada = READA_FORWARD;
1715  
1716  	reloc_root = root->reloc_root;
1717  	root_item = &reloc_root->root_item;
1718  
1719  	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1720  		level = btrfs_root_level(root_item);
1721  		atomic_inc(&reloc_root->node->refs);
1722  		path->nodes[level] = reloc_root->node;
1723  		path->slots[level] = 0;
1724  	} else {
1725  		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1726  
1727  		level = btrfs_root_drop_level(root_item);
1728  		BUG_ON(level == 0);
1729  		path->lowest_level = level;
1730  		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1731  		path->lowest_level = 0;
1732  		if (ret < 0) {
1733  			btrfs_free_path(path);
1734  			return ret;
1735  		}
1736  
1737  		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1738  				      path->slots[level]);
1739  		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1740  
1741  		btrfs_unlock_up_safe(path, 0);
1742  	}
1743  
1744  	/*
1745  	 * In merge_reloc_root(), we modify the upper level pointer to swap the
1746  	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
1747  	 * block COW, we COW at most from level 1 to root level for each tree.
1748  	 *
1749  	 * Thus the needed metadata size is at most root_level * nodesize,
1750  	 * doubled since we have two trees to COW.
1751  	 */
1752  	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
1753  	min_reserved = fs_info->nodesize * reserve_level * 2;
1754  	memset(&next_key, 0, sizeof(next_key));
1755  
1756  	while (1) {
1757  		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
1758  					     min_reserved,
1759  					     BTRFS_RESERVE_FLUSH_LIMIT);
1760  		if (ret)
1761  			goto out;
1762  		trans = btrfs_start_transaction(root, 0);
1763  		if (IS_ERR(trans)) {
1764  			ret = PTR_ERR(trans);
1765  			trans = NULL;
1766  			goto out;
1767  		}
1768  
1769  		/*
1770  		 * At this point we no longer have a reloc_control, so we can't
1771  		 * depend on btrfs_init_reloc_root to update our last_trans.
1772  		 *
1773  		 * But that's ok, we started the trans handle on our
1774  		 * corresponding fs_root, which means it's been added to the
1775  		 * dirty list.  At commit time we'll still call
1776  		 * btrfs_update_reloc_root() and update our root item
1777  		 * appropriately.
1778  		 */
1779  		reloc_root->last_trans = trans->transid;
1780  		trans->block_rsv = rc->block_rsv;
1781  
1782  		replaced = 0;
1783  		max_level = level;
1784  
1785  		ret = walk_down_reloc_tree(reloc_root, path, &level);
1786  		if (ret < 0)
1787  			goto out;
1788  		if (ret > 0)
1789  			break;
1790  
1791  		if (!find_next_key(path, level, &key) &&
1792  		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1793  			ret = 0;
1794  		} else {
1795  			ret = replace_path(trans, rc, root, reloc_root, path,
1796  					   &next_key, level, max_level);
1797  		}
1798  		if (ret < 0)
1799  			goto out;
1800  		if (ret > 0) {
1801  			level = ret;
1802  			btrfs_node_key_to_cpu(path->nodes[level], &key,
1803  					      path->slots[level]);
1804  			replaced = 1;
1805  		}
1806  
1807  		ret = walk_up_reloc_tree(reloc_root, path, &level);
1808  		if (ret > 0)
1809  			break;
1810  
1811  		BUG_ON(level == 0);
1812  		/*
1813  		 * Save the merging progress in drop_progress.
1814  		 * This is OK since root refs == 1 in this case.
1815  		 */
1816  		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1817  			       path->slots[level]);
1818  		btrfs_set_root_drop_level(root_item, level);
1819  
1820  		btrfs_end_transaction_throttle(trans);
1821  		trans = NULL;
1822  
1823  		btrfs_btree_balance_dirty(fs_info);
1824  
1825  		if (replaced && rc->stage == UPDATE_DATA_PTRS)
1826  			invalidate_extent_cache(root, &key, &next_key);
1827  	}
1828  
1829  	/*
1830  	 * Handle the case where only one block in the fs tree needs to be
1831  	 * relocated and that block is the tree root.
1832  	 */
1833  	leaf = btrfs_lock_root_node(root);
1834  	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
1835  			      BTRFS_NESTING_COW);
1836  	btrfs_tree_unlock(leaf);
1837  	free_extent_buffer(leaf);
1838  out:
1839  	btrfs_free_path(path);
1840  
1841  	if (ret == 0) {
1842  		ret = insert_dirty_subvol(trans, rc, root);
1843  		if (ret)
1844  			btrfs_abort_transaction(trans, ret);
1845  	}
1846  
1847  	if (trans)
1848  		btrfs_end_transaction_throttle(trans);
1849  
1850  	btrfs_btree_balance_dirty(fs_info);
1851  
1852  	if (replaced && rc->stage == UPDATE_DATA_PTRS)
1853  		invalidate_extent_cache(root, &key, &next_key);
1854  
1855  	return ret;
1856  }
1857  
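/*
 * Prepare to merge the reloc trees back into their fs trees: reserve
 * metadata space for the merge, set root refs to 1 on every reloc root
 * (so btrfs_recover_relocation() knows merging must be resumed after an
 * interruption), and commit that state to disk.
 */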
1858  static noinline_for_stack
1859  int prepare_to_merge(struct reloc_control *rc, int err)
1860  {
1861  	struct btrfs_root *root = rc->extent_root;
1862  	struct btrfs_fs_info *fs_info = root->fs_info;
1863  	struct btrfs_root *reloc_root;
1864  	struct btrfs_trans_handle *trans;
1865  	LIST_HEAD(reloc_roots);
1866  	u64 num_bytes = 0;
1867  	int ret;
1868  
1869  	mutex_lock(&fs_info->reloc_mutex);
1870  	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1871  	rc->merging_rsv_size += rc->nodes_relocated * 2;
1872  	mutex_unlock(&fs_info->reloc_mutex);
1873  
1874  again:
1875  	if (!err) {
1876  		num_bytes = rc->merging_rsv_size;
1877  		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
1878  					  BTRFS_RESERVE_FLUSH_ALL);
1879  		if (ret)
1880  			err = ret;
1881  	}
1882  
1883  	trans = btrfs_join_transaction(rc->extent_root);
1884  	if (IS_ERR(trans)) {
1885  		if (!err)
1886  			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1887  						num_bytes, NULL);
1888  		return PTR_ERR(trans);
1889  	}
1890  
1891  	if (!err) {
1892  		if (num_bytes != rc->merging_rsv_size) {
1893  			btrfs_end_transaction(trans);
1894  			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1895  						num_bytes, NULL);
1896  			goto again;
1897  		}
1898  	}
1899  
1900  	rc->merge_reloc_tree = 1;
1901  
1902  	while (!list_empty(&rc->reloc_roots)) {
1903  		reloc_root = list_entry(rc->reloc_roots.next,
1904  					struct btrfs_root, root_list);
1905  		list_del_init(&reloc_root->root_list);
1906  
1907  		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1908  				false);
1909  		if (IS_ERR(root)) {
1910  			/*
1911  			 * Even if we have an error we need this reloc root
1912  			 * back on our list so we can clean up properly.
1913  			 */
1914  			list_add(&reloc_root->root_list, &reloc_roots);
1915  			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
1916  			if (!err)
1917  				err = PTR_ERR(root);
1918  			break;
1919  		}
1920  
1921  		if (unlikely(root->reloc_root != reloc_root)) {
1922  			if (root->reloc_root) {
1923  				btrfs_err(fs_info,
1924  "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
1925  					  root->root_key.objectid,
1926  					  root->reloc_root->root_key.objectid,
1927  					  root->reloc_root->root_key.type,
1928  					  root->reloc_root->root_key.offset,
1929  					  btrfs_root_generation(
1930  						  &root->reloc_root->root_item),
1931  					  reloc_root->root_key.objectid,
1932  					  reloc_root->root_key.type,
1933  					  reloc_root->root_key.offset,
1934  					  btrfs_root_generation(
1935  						  &reloc_root->root_item));
1936  			} else {
1937  				btrfs_err(fs_info,
1938  "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
1939  					  root->root_key.objectid,
1940  					  reloc_root->root_key.objectid,
1941  					  reloc_root->root_key.type,
1942  					  reloc_root->root_key.offset,
1943  					  btrfs_root_generation(
1944  						  &reloc_root->root_item));
1945  			}
1946  			list_add(&reloc_root->root_list, &reloc_roots);
1947  			btrfs_put_root(root);
1948  			btrfs_abort_transaction(trans, -EUCLEAN);
1949  			if (!err)
1950  				err = -EUCLEAN;
1951  			break;
1952  		}
1953  
1954  		/*
1955  		 * Set the reference count to 1, so btrfs_recover_relocation()
1956  		 * knows it should resume merging.
1957  		 */
1958  		if (!err)
1959  			btrfs_set_root_refs(&reloc_root->root_item, 1);
1960  		ret = btrfs_update_reloc_root(trans, root);
1961  
1962  		/*
1963  		 * Even if we have an error we need this reloc root back on our
1964  		 * list so we can clean up properly.
1965  		 */
1966  		list_add(&reloc_root->root_list, &reloc_roots);
1967  		btrfs_put_root(root);
1968  
1969  		if (ret) {
1970  			btrfs_abort_transaction(trans, ret);
1971  			if (!err)
1972  				err = ret;
1973  			break;
1974  		}
1975  	}
1976  
1977  	list_splice(&reloc_roots, &rc->reloc_roots);
1978  
1979  	if (!err)
1980  		err = btrfs_commit_transaction(trans);
1981  	else
1982  		btrfs_end_transaction(trans);
1983  	return err;
1984  }
1985  
1986  static noinline_for_stack
1987  void free_reloc_roots(struct list_head *list)
1988  {
1989  	struct btrfs_root *reloc_root, *tmp;
1990  
1991  	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
1992  		__del_reloc_root(reloc_root);
1993  }
1994  
1995  static noinline_for_stack
1996  void merge_reloc_roots(struct reloc_control *rc)
1997  {
1998  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1999  	struct btrfs_root *root;
2000  	struct btrfs_root *reloc_root;
2001  	LIST_HEAD(reloc_roots);
2002  	int found = 0;
2003  	int ret = 0;
2004  again:
2005  	root = rc->extent_root;
2006  
2007  	/*
2008  	 * This serializes us with btrfs_record_root_in_transaction(); we
2009  	 * have to make sure nobody is in the middle of adding their roots
2010  	 * to the list while we are doing this splice.
2012  	 */
2013  	mutex_lock(&fs_info->reloc_mutex);
2014  	list_splice_init(&rc->reloc_roots, &reloc_roots);
2015  	mutex_unlock(&fs_info->reloc_mutex);
2016  
2017  	while (!list_empty(&reloc_roots)) {
2018  		found = 1;
2019  		reloc_root = list_entry(reloc_roots.next,
2020  					struct btrfs_root, root_list);
2021  
2022  		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
2023  					 false);
2024  		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2025  			if (WARN_ON(IS_ERR(root))) {
2026  				/*
2027  				 * For recovery we read the fs roots on mount,
2028  				 * and if we didn't find the root then we marked
2029  				 * the reloc root as a garbage root.  For normal
2030  				 * relocation obviously the root should exist in
2031  				 * memory.  However there's no reason we can't
2032  				 * handle the error properly here just in case.
2033  				 */
2034  				ret = PTR_ERR(root);
2035  				goto out;
2036  			}
2037  			if (WARN_ON(root->reloc_root != reloc_root)) {
2038  				/*
2039  				 * This can happen if on-disk metadata has some
2040  				 * corruption, e.g. bad reloc tree key offset.
2041  				 */
2042  				ret = -EINVAL;
2043  				goto out;
2044  			}
2045  			ret = merge_reloc_root(rc, root);
2046  			btrfs_put_root(root);
2047  			if (ret) {
2048  				if (list_empty(&reloc_root->root_list))
2049  					list_add_tail(&reloc_root->root_list,
2050  						      &reloc_roots);
2051  				goto out;
2052  			}
2053  		} else {
2054  			if (!IS_ERR(root)) {
2055  				if (root->reloc_root == reloc_root) {
2056  					root->reloc_root = NULL;
2057  					btrfs_put_root(reloc_root);
2058  				}
2059  				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
2060  					  &root->state);
2061  				btrfs_put_root(root);
2062  			}
2063  
2064  			list_del_init(&reloc_root->root_list);
2065  			/* Don't forget to queue this reloc root for cleanup */
2066  			list_add_tail(&reloc_root->reloc_dirty_list,
2067  				      &rc->dirty_subvol_roots);
2068  		}
2069  	}
2070  
2071  	if (found) {
2072  		found = 0;
2073  		goto again;
2074  	}
2075  out:
2076  	if (ret) {
2077  		btrfs_handle_fs_error(fs_info, ret, NULL);
2078  		free_reloc_roots(&reloc_roots);
2079  
2080  		/* new reloc root may be added */
2081  		mutex_lock(&fs_info->reloc_mutex);
2082  		list_splice_init(&rc->reloc_roots, &reloc_roots);
2083  		mutex_unlock(&fs_info->reloc_mutex);
2084  		free_reloc_roots(&reloc_roots);
2085  	}
2086  
2087  	/*
2088  	 * We used to have
2089  	 *
2090  	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2091  	 *
2092  	 * here, but it's wrong.  If we fail to start the transaction in
2093  	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
2094  	 * have actually been removed from the reloc_root_tree rb tree.  This is
2095  	 * fine because we're bailing here, and we hold a reference on the root
2096  	 * for the list that holds it, so these roots will be cleaned up when we
2097  	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
2098  	 * will be cleaned up on unmount.
2099  	 *
2100  	 * The remaining nodes will be cleaned up by free_reloc_control.
2101  	 */
2102  }
2103  
2104  static void free_block_list(struct rb_root *blocks)
2105  {
2106  	struct tree_block *block;
2107  	struct rb_node *rb_node;
2108  	while ((rb_node = rb_first(blocks))) {
2109  		block = rb_entry(rb_node, struct tree_block, rb_node);
2110  		rb_erase(rb_node, blocks);
2111  		kfree(block);
2112  	}
2113  }
2114  
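/*
 * Record the fs root owning @reloc_root in the current transaction.  This
 * is a no-op if the reloc root was already recorded in this transaction.
 */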
2115  static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2116  				      struct btrfs_root *reloc_root)
2117  {
2118  	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2119  	struct btrfs_root *root;
2120  	int ret;
2121  
2122  	if (reloc_root->last_trans == trans->transid)
2123  		return 0;
2124  
2125  	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
2126  
2127  	/*
2128  	 * This should succeed, since we can't have a reloc root without having
2129  	 * already looked up the actual root and created the reloc root for this
2130  	 * root.
2131  	 *
2132  	 * However if there's some sort of corruption where we have a ref to a
2133  	 * reloc root without a corresponding root this could return ENOENT.
2134  	 */
2135  	if (IS_ERR(root)) {
2136  		ASSERT(0);
2137  		return PTR_ERR(root);
2138  	}
2139  	if (root->reloc_root != reloc_root) {
2140  		ASSERT(0);
2141  		btrfs_err(fs_info,
2142  			  "root %llu has two reloc roots associated with it",
2143  			  reloc_root->root_key.offset);
2144  		btrfs_put_root(root);
2145  		return -EUCLEAN;
2146  	}
2147  	ret = btrfs_record_root_in_trans(trans, root);
2148  	btrfs_put_root(root);
2149  
2150  	return ret;
2151  }
2152  
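/*
 * Walk the backref edges upwards from @node and pick the reloc root that
 * new pointers for the block should be written into.  As a side effect,
 * the chosen backref path is recorded in rc->backref_cache.path for
 * btrfs_reloc_cow_block().
 */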
2153  static noinline_for_stack
2154  struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2155  				     struct reloc_control *rc,
2156  				     struct btrfs_backref_node *node,
2157  				     struct btrfs_backref_edge *edges[])
2158  {
2159  	struct btrfs_backref_node *next;
2160  	struct btrfs_root *root;
2161  	int index = 0;
2162  	int ret;
2163  
2164  	next = node;
2165  	while (1) {
2166  		cond_resched();
2167  		next = walk_up_backref(next, edges, &index);
2168  		root = next->root;
2169  
2170  		/*
2171  		 * If there is no root, then our references for this block are
2172  		 * incomplete, as we should be able to walk all the way up to a
2173  		 * block that is owned by a root.
2174  		 *
2175  		 * This path is only for SHAREABLE roots, so if we come upon a
2176  		 * non-SHAREABLE root then we have backrefs that resolve
2177  		 * improperly.
2178  		 *
2179  		 * Both of these cases indicate file system corruption, or a bug
2180  		 * in the backref walking code.
2181  		 */
2182  		if (!root) {
2183  			ASSERT(0);
2184  			btrfs_err(trans->fs_info,
2185  		"bytenr %llu doesn't have a backref path ending in a root",
2186  				  node->bytenr);
2187  			return ERR_PTR(-EUCLEAN);
2188  		}
2189  		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2190  			ASSERT(0);
2191  			btrfs_err(trans->fs_info,
2192  	"bytenr %llu has multiple refs with one ending in a non-shareable root",
2193  				  node->bytenr);
2194  			return ERR_PTR(-EUCLEAN);
2195  		}
2196  
2197  		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2198  			ret = record_reloc_root_in_trans(trans, root);
2199  			if (ret)
2200  				return ERR_PTR(ret);
2201  			break;
2202  		}
2203  
2204  		ret = btrfs_record_root_in_trans(trans, root);
2205  		if (ret)
2206  			return ERR_PTR(ret);
2207  		root = root->reloc_root;
2208  
2209  		/*
2210  		 * We could have raced with another thread which failed, so
2211  		 * root->reloc_root may not be set, return ENOENT in this case.
2212  		 */
2213  		if (!root)
2214  			return ERR_PTR(-ENOENT);
2215  
2216  		if (next->new_bytenr != root->node->start) {
2217  			/*
2218  			 * We just created the reloc root, so we shouldn't have
2219  			 * ->new_bytenr set and this shouldn't be in the changed
2220  			 * list.  If it is then we have multiple roots pointing
2221  			 * at the same bytenr which indicates corruption, or
2222  			 * we've made a mistake in the backref walking code.
2223  			 */
2224  			ASSERT(next->new_bytenr == 0);
2225  			ASSERT(list_empty(&next->list));
2226  			if (next->new_bytenr || !list_empty(&next->list)) {
2227  				btrfs_err(trans->fs_info,
2228  	"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
2229  					  node->bytenr, next->bytenr);
2230  				return ERR_PTR(-EUCLEAN);
2231  			}
2232  
2233  			next->new_bytenr = root->node->start;
2234  			btrfs_put_root(next->root);
2235  			next->root = btrfs_grab_root(root);
2236  			ASSERT(next->root);
2237  			list_add_tail(&next->list,
2238  				      &rc->backref_cache.changed);
2239  			mark_block_processed(rc, next);
2240  			break;
2241  		}
2242  
2243  		WARN_ON(1);
2244  		root = NULL;
2245  		next = walk_down_backref(edges, &index);
2246  		if (!next || next->level <= node->level)
2247  			break;
2248  	}
2249  	if (!root) {
2250  		/*
2251  		 * This can happen if there's fs corruption or if there's a bug
2252  		 * in the backref lookup code.
2253  		 */
2254  		ASSERT(0);
2255  		return ERR_PTR(-ENOENT);
2256  	}
2257  
2258  	next = node;
2259  	/* setup backref node path for btrfs_reloc_cow_block */
2260  	while (1) {
2261  		rc->backref_cache.path[next->level] = next;
2262  		if (--index < 0)
2263  			break;
2264  		next = edges[index]->node[UPPER];
2265  	}
2266  	return root;
2267  }
2268  
2269  /*
2270   * Select a tree root for relocation.
2271   *
2272   * Return NULL if the block is not shareable. We should use do_relocation() in
2273   * this case.
2274   *
2275   * Return a tree root pointer if the block is shareable.
2276   * Return -ENOENT if the block is the root of a reloc tree.
2277   */
2278  static noinline_for_stack
2279  struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
2280  {
2281  	struct btrfs_backref_node *next;
2282  	struct btrfs_root *root;
2283  	struct btrfs_root *fs_root = NULL;
2284  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2285  	int index = 0;
2286  
2287  	next = node;
2288  	while (1) {
2289  		cond_resched();
2290  		next = walk_up_backref(next, edges, &index);
2291  		root = next->root;
2292  
2293  		/*
2294  		 * This can occur if we have incomplete extent refs leading all
2295  		 * the way up a particular path, in this case return -EUCLEAN.
2296  		 */
2297  		if (!root)
2298  			return ERR_PTR(-EUCLEAN);
2299  
2300  		/* No other choice for non-shareable tree */
2301  		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2302  			return root;
2303  
2304  		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2305  			fs_root = root;
2306  
2307  		if (next != node)
2308  			return NULL;
2309  
2310  		next = walk_down_backref(edges, &index);
2311  		if (!next || next->level <= node->level)
2312  			break;
2313  	}
2314  
2315  	if (!fs_root)
2316  		return ERR_PTR(-ENOENT);
2317  	return fs_root;
2318  }
2319  
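/*
 * Estimate the metadata size needed to relocate @node: one nodesize for
 * the block itself, plus one for every not yet processed block reachable
 * by walking the backref edges upwards.
 */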
2320  static noinline_for_stack
2321  u64 calcu_metadata_size(struct reloc_control *rc,
2322  			struct btrfs_backref_node *node, int reserve)
2323  {
2324  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2325  	struct btrfs_backref_node *next = node;
2326  	struct btrfs_backref_edge *edge;
2327  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2328  	u64 num_bytes = 0;
2329  	int index = 0;
2330  
2331  	BUG_ON(reserve && node->processed);
2332  
2333  	while (next) {
2334  		cond_resched();
2335  		while (1) {
2336  			if (next->processed && (reserve || next != node))
2337  				break;
2338  
2339  			num_bytes += fs_info->nodesize;
2340  
2341  			if (list_empty(&next->upper))
2342  				break;
2343  
2344  			edge = list_entry(next->upper.next,
2345  					struct btrfs_backref_edge, list[LOWER]);
2346  			edges[index++] = edge;
2347  			next = edge->node[UPPER];
2348  		}
2349  		next = walk_down_backref(edges, &index);
2350  	}
2351  	return num_bytes;
2352  }
2353  
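/*
 * Reserve metadata space for relocating @node.  On ENOSPC the block rsv
 * size is enlarged and -EAGAIN is returned, telling the caller to drop
 * the transaction and retry after flushing.
 */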
2354  static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2355  				  struct reloc_control *rc,
2356  				  struct btrfs_backref_node *node)
2357  {
2358  	struct btrfs_root *root = rc->extent_root;
2359  	struct btrfs_fs_info *fs_info = root->fs_info;
2360  	u64 num_bytes;
2361  	int ret;
2362  	u64 tmp;
2363  
2364  	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2365  
2366  	trans->block_rsv = rc->block_rsv;
2367  	rc->reserved_bytes += num_bytes;
2368  
2369  	/*
2370  	 * We are under a transaction here so we can only do limited flushing.
2371  	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2372  	 * transaction and try to refill when we can flush all the things.
2373  	 */
2374  	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
2375  				     BTRFS_RESERVE_FLUSH_LIMIT);
2376  	if (ret) {
2377  		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2378  		while (tmp <= rc->reserved_bytes)
2379  			tmp <<= 1;
2380  		/*
2381  		 * Only one thread can access block_rsv at this point, so we
2382  		 * don't need to hold a lock to protect it.
2383  		 * We expand the reservation size here to allow enough
2384  		 * space for relocation, and we will return early in the
2385  		 * ENOSPC case.
2386  		 */
2387  		rc->block_rsv->size = tmp + fs_info->nodesize *
2388  				      RELOCATION_RESERVED_NODES;
2389  		return -EAGAIN;
2390  	}
2391  
2392  	return 0;
2393  }
2394  
2395  /*
2396   * Relocate a tree block, and then update pointers in upper level
2397   * blocks that reference the block to point to the new location.
2398   *
2399   * If called by link_to_upper, the block has already been relocated;
2400   * in that case this function just updates pointers.
2401   */
2402  static int do_relocation(struct btrfs_trans_handle *trans,
2403  			 struct reloc_control *rc,
2404  			 struct btrfs_backref_node *node,
2405  			 struct btrfs_key *key,
2406  			 struct btrfs_path *path, int lowest)
2407  {
2408  	struct btrfs_backref_node *upper;
2409  	struct btrfs_backref_edge *edge;
2410  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2411  	struct btrfs_root *root;
2412  	struct extent_buffer *eb;
2413  	u32 blocksize;
2414  	u64 bytenr;
2415  	int slot;
2416  	int ret = 0;
2417  
2418  	/*
2419  	 * If we are lowest then this is the first time we're processing this
2420  	 * block, and thus shouldn't have an eb associated with it yet.
2421  	 */
2422  	ASSERT(!lowest || !node->eb);
2423  
2424  	path->lowest_level = node->level + 1;
2425  	rc->backref_cache.path[node->level] = node;
2426  	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2427  		struct btrfs_ref ref = { 0 };
2428  
2429  		cond_resched();
2430  
2431  		upper = edge->node[UPPER];
2432  		root = select_reloc_root(trans, rc, upper, edges);
2433  		if (IS_ERR(root)) {
2434  			ret = PTR_ERR(root);
2435  			goto next;
2436  		}
2437  
2438  		if (upper->eb && !upper->locked) {
2439  			if (!lowest) {
2440  				ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2441  				if (ret < 0)
2442  					goto next;
2443  				BUG_ON(ret);
2444  				bytenr = btrfs_node_blockptr(upper->eb, slot);
2445  				if (node->eb->start == bytenr)
2446  					goto next;
2447  			}
2448  			btrfs_backref_drop_node_buffer(upper);
2449  		}
2450  
2451  		if (!upper->eb) {
2452  			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2453  			if (ret) {
2454  				if (ret > 0)
2455  					ret = -ENOENT;
2456  
2457  				btrfs_release_path(path);
2458  				break;
2459  			}
2460  
2461  			if (!upper->eb) {
2462  				upper->eb = path->nodes[upper->level];
2463  				path->nodes[upper->level] = NULL;
2464  			} else {
2465  				BUG_ON(upper->eb != path->nodes[upper->level]);
2466  			}
2467  
2468  			upper->locked = 1;
2469  			path->locks[upper->level] = 0;
2470  
2471  			slot = path->slots[upper->level];
2472  			btrfs_release_path(path);
2473  		} else {
2474  			ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2475  			if (ret < 0)
2476  				goto next;
2477  			BUG_ON(ret);
2478  		}
2479  
2480  		bytenr = btrfs_node_blockptr(upper->eb, slot);
2481  		if (lowest) {
2482  			if (bytenr != node->bytenr) {
2483  				btrfs_err(root->fs_info,
2484  		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2485  					  bytenr, node->bytenr, slot,
2486  					  upper->eb->start);
2487  				ret = -EIO;
2488  				goto next;
2489  			}
2490  		} else {
2491  			if (node->eb->start == bytenr)
2492  				goto next;
2493  		}
2494  
2495  		blocksize = root->fs_info->nodesize;
2496  		eb = btrfs_read_node_slot(upper->eb, slot);
2497  		if (IS_ERR(eb)) {
2498  			ret = PTR_ERR(eb);
2499  			goto next;
2500  		}
2501  		btrfs_tree_lock(eb);
2502  
2503  		if (!node->eb) {
2504  			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2505  					      slot, &eb, BTRFS_NESTING_COW);
2506  			btrfs_tree_unlock(eb);
2507  			free_extent_buffer(eb);
2508  			if (ret < 0)
2509  				goto next;
2510  			/*
2511  			 * We've just COWed this block, it should have updated
2512  			 * the correct backref node entry.
2513  			 */
2514  			ASSERT(node->eb == eb);
2515  		} else {
2516  			btrfs_set_node_blockptr(upper->eb, slot,
2517  						node->eb->start);
2518  			btrfs_set_node_ptr_generation(upper->eb, slot,
2519  						      trans->transid);
2520  			btrfs_mark_buffer_dirty(upper->eb);
2521  
2522  			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2523  					       node->eb->start, blocksize,
2524  					       upper->eb->start);
2525  			btrfs_init_tree_ref(&ref, node->level,
2526  					    btrfs_header_owner(upper->eb),
2527  					    root->root_key.objectid, false);
2528  			ret = btrfs_inc_extent_ref(trans, &ref);
2529  			if (!ret)
2530  				ret = btrfs_drop_subtree(trans, root, eb,
2531  							 upper->eb);
2532  			if (ret)
2533  				btrfs_abort_transaction(trans, ret);
2534  		}
2535  next:
2536  		if (!upper->pending)
2537  			btrfs_backref_drop_node_buffer(upper);
2538  		else
2539  			btrfs_backref_unlock_node_buffer(upper);
2540  		if (ret)
2541  			break;
2542  	}
2543  
2544  	if (!ret && node->pending) {
2545  		btrfs_backref_drop_node_buffer(node);
2546  		list_move_tail(&node->list, &rc->backref_cache.changed);
2547  		node->pending = 0;
2548  	}
2549  
2550  	path->lowest_level = 0;
2551  
2552  	/*
2553  	 * We should have allocated all of our space in the block rsv and thus
2554  	 * shouldn't ENOSPC.
2555  	 */
2556  	ASSERT(ret != -ENOSPC);
2557  	return ret;
2558  }
2559  
2560  static int link_to_upper(struct btrfs_trans_handle *trans,
2561  			 struct reloc_control *rc,
2562  			 struct btrfs_backref_node *node,
2563  			 struct btrfs_path *path)
2564  {
2565  	struct btrfs_key key;
2566  
2567  	btrfs_node_key_to_cpu(node->eb, &key, 0);
2568  	return do_relocation(trans, rc, node, &key, path, 0);
2569  }
2570  
2571  static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2572  				struct reloc_control *rc,
2573  				struct btrfs_path *path, int err)
2574  {
2575  	LIST_HEAD(list);
2576  	struct btrfs_backref_cache *cache = &rc->backref_cache;
2577  	struct btrfs_backref_node *node;
2578  	int level;
2579  	int ret;
2580  
2581  	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2582  		while (!list_empty(&cache->pending[level])) {
2583  			node = list_entry(cache->pending[level].next,
2584  					  struct btrfs_backref_node, list);
2585  			list_move_tail(&node->list, &list);
2586  			BUG_ON(!node->pending);
2587  
2588  			if (!err) {
2589  				ret = link_to_upper(trans, rc, node, path);
2590  				if (ret < 0)
2591  					err = ret;
2592  			}
2593  		}
2594  		list_splice_init(&list, &cache->pending[level]);
2595  	}
2596  	return err;
2597  }
2598  
2599  /*
2600   * Mark a block, and all blocks that directly or indirectly reference it,
2601   * as processed.
2602   */
2603  static void update_processed_blocks(struct reloc_control *rc,
2604  				    struct btrfs_backref_node *node)
2605  {
2606  	struct btrfs_backref_node *next = node;
2607  	struct btrfs_backref_edge *edge;
2608  	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2609  	int index = 0;
2610  
2611  	while (next) {
2612  		cond_resched();
2613  		while (1) {
2614  			if (next->processed)
2615  				break;
2616  
2617  			mark_block_processed(rc, next);
2618  
2619  			if (list_empty(&next->upper))
2620  				break;
2621  
2622  			edge = list_entry(next->upper.next,
2623  					struct btrfs_backref_edge, list[LOWER]);
2624  			edges[index++] = edge;
2625  			next = edge->node[UPPER];
2626  		}
2627  		next = walk_down_backref(edges, &index);
2628  	}
2629  }
2630  
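/* Check if the tree block at @bytenr has already been relocated. */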
2631  static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2632  {
2633  	u32 blocksize = rc->extent_root->fs_info->nodesize;
2634  
2635  	if (test_range_bit(&rc->processed_blocks, bytenr,
2636  			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2637  		return 1;
2638  	return 0;
2639  }
2640  
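/*
 * Read the tree block described by @block and save its first key, so a
 * backref tree can be built for the block later.
 */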
2641  static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2642  			      struct tree_block *block)
2643  {
2644  	struct btrfs_tree_parent_check check = {
2645  		.level = block->level,
2646  		.owner_root = block->owner,
2647  		.transid = block->key.offset
2648  	};
2649  	struct extent_buffer *eb;
2650  
2651  	eb = read_tree_block(fs_info, block->bytenr, &check);
2652  	if (IS_ERR(eb))
2653  		return PTR_ERR(eb);
2654  	if (!extent_buffer_uptodate(eb)) {
2655  		free_extent_buffer(eb);
2656  		return -EIO;
2657  	}
2658  	if (block->level == 0)
2659  		btrfs_item_key_to_cpu(eb, &block->key, 0);
2660  	else
2661  		btrfs_node_key_to_cpu(eb, &block->key, 0);
2662  	free_extent_buffer(eb);
2663  	block->key_ready = 1;
2664  	return 0;
2665  }
2666  
2667  /*
2668   * helper function to relocate a tree block
2669   */
2670  static int relocate_tree_block(struct btrfs_trans_handle *trans,
2671  				struct reloc_control *rc,
2672  				struct btrfs_backref_node *node,
2673  				struct btrfs_key *key,
2674  				struct btrfs_path *path)
2675  {
2676  	struct btrfs_root *root;
2677  	int ret = 0;
2678  
2679  	if (!node)
2680  		return 0;
2681  
2682  	/*
2683  	 * If we fail here we want to drop our backref_node because we are going
2684  	 * to start over and regenerate the tree for it.
2685  	 */
2686  	ret = reserve_metadata_space(trans, rc, node);
2687  	if (ret)
2688  		goto out;
2689  
2690  	BUG_ON(node->processed);
2691  	root = select_one_root(node);
2692  	if (IS_ERR(root)) {
2693  		ret = PTR_ERR(root);
2694  
2695  		/* See explanation in select_one_root for the -EUCLEAN case. */
2696  		ASSERT(ret == -ENOENT);
2697  		if (ret == -ENOENT) {
2698  			ret = 0;
2699  			update_processed_blocks(rc, node);
2700  		}
2701  		goto out;
2702  	}
2703  
2704  	if (root) {
2705  		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2706  			/*
2707  			 * This block was the root block of a root, and this is
2708  			 * the first time we're processing the block and thus it
2709  			 * should not have had the ->new_bytenr modified and
2710  			 * should have not been included on the changed list.
2711  			 *
2712  			 * However in the case of corruption we could have
2713  			 * multiple refs pointing to the same block improperly,
2714  			 * and thus we would trip over these checks.  ASSERT()
2715  			 * for the developer case, because it could indicate a
2716  			 * bug in the backref code, however error out for a
2717  			 * normal user in the case of corruption.
2718  			 */
2719  			ASSERT(node->new_bytenr == 0);
2720  			ASSERT(list_empty(&node->list));
2721  			if (node->new_bytenr || !list_empty(&node->list)) {
2722  				btrfs_err(root->fs_info,
2723  				  "bytenr %llu has improper references to it",
2724  					  node->bytenr);
2725  				ret = -EUCLEAN;
2726  				goto out;
2727  			}
2728  			ret = btrfs_record_root_in_trans(trans, root);
2729  			if (ret)
2730  				goto out;
2731  			/*
2732  			 * Another thread could have failed, need to check if we
2733  			 * have reloc_root actually set.
2734  			 */
2735  			if (!root->reloc_root) {
2736  				ret = -ENOENT;
2737  				goto out;
2738  			}
2739  			root = root->reloc_root;
2740  			node->new_bytenr = root->node->start;
2741  			btrfs_put_root(node->root);
2742  			node->root = btrfs_grab_root(root);
2743  			ASSERT(node->root);
2744  			list_add_tail(&node->list, &rc->backref_cache.changed);
2745  		} else {
2746  			path->lowest_level = node->level;
2747  			if (root == root->fs_info->chunk_root)
2748  				btrfs_reserve_chunk_metadata(trans, false);
2749  			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2750  			btrfs_release_path(path);
2751  			if (root == root->fs_info->chunk_root)
2752  				btrfs_trans_release_chunk_metadata(trans);
2753  			if (ret > 0)
2754  				ret = 0;
2755  		}
2756  		if (!ret)
2757  			update_processed_blocks(rc, node);
2758  	} else {
2759  		ret = do_relocation(trans, rc, node, key, path, 1);
2760  	}
2761  out:
2762  	if (ret || node->level == 0 || node->cowonly)
2763  		btrfs_backref_cleanup_node(&rc->backref_cache, node);
2764  	return ret;
2765  }
2766  
2767  /*
2768   * relocate a list of blocks
2769   */
2770  static noinline_for_stack
2771  int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2772  			 struct reloc_control *rc, struct rb_root *blocks)
2773  {
2774  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2775  	struct btrfs_backref_node *node;
2776  	struct btrfs_path *path;
2777  	struct tree_block *block;
2778  	struct tree_block *next;
2779  	int ret;
2780  	int err = 0;
2781  
2782  	path = btrfs_alloc_path();
2783  	if (!path) {
2784  		err = -ENOMEM;
2785  		goto out_free_blocks;
2786  	}
2787  
2788  	/* Kick in readahead for tree blocks with missing keys */
2789  	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2790  		if (!block->key_ready)
2791  			btrfs_readahead_tree_block(fs_info, block->bytenr,
2792  						   block->owner, 0,
2793  						   block->level);
2794  	}
2795  
2796  	/* Get first keys */
2797  	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2798  		if (!block->key_ready) {
2799  			err = get_tree_block_key(fs_info, block);
2800  			if (err)
2801  				goto out_free_path;
2802  		}
2803  	}
2804  
2805  	/* Do tree relocation */
2806  	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2807  		node = build_backref_tree(trans, rc, &block->key,
2808  					  block->level, block->bytenr);
2809  		if (IS_ERR(node)) {
2810  			err = PTR_ERR(node);
2811  			goto out;
2812  		}
2813  
2814  		ret = relocate_tree_block(trans, rc, node, &block->key,
2815  					  path);
2816  		if (ret < 0) {
2817  			err = ret;
2818  			break;
2819  		}
2820  	}
2821  out:
2822  	err = finish_pending_nodes(trans, rc, path, err);
2823  
2824  out_free_path:
2825  	btrfs_free_path(path);
2826  out_free_blocks:
2827  	free_block_list(blocks);
2828  	return err;
2829  }
2830  
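/*
 * Preallocate file extents in the data reloc inode covering the whole
 * cluster, one range per cluster boundary, so that writeback later
 * produces extents with the same layout as the source extents.
 */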
2831  static noinline_for_stack int prealloc_file_extent_cluster(
2832  				struct btrfs_inode *inode,
2833  				struct file_extent_cluster *cluster)
2834  {
2835  	u64 alloc_hint = 0;
2836  	u64 start;
2837  	u64 end;
2838  	u64 offset = inode->index_cnt;
2839  	u64 num_bytes;
2840  	int nr;
2841  	int ret = 0;
2842  	u64 i_size = i_size_read(&inode->vfs_inode);
2843  	u64 prealloc_start = cluster->start - offset;
2844  	u64 prealloc_end = cluster->end - offset;
2845  	u64 cur_offset = prealloc_start;
2846  
2847  	/*
2848  	 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
2849  	 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
2850  	 * the btrfs_do_readpage() call for a previously relocated file cluster.
2851  	 *
2852  	 * If the current cluster starts in the above range, btrfs_do_readpage()
2853  	 * will skip the read, and relocate_one_page() will later writeback
2854  	 * the padding zeros as new data, causing data corruption.
2855  	 *
2856  	 * Here we have to manually invalidate the range [i_size, PAGE_END + 1).
2857  	 */
2858  	if (!PAGE_ALIGNED(i_size)) {
2859  		struct address_space *mapping = inode->vfs_inode.i_mapping;
2860  		struct btrfs_fs_info *fs_info = inode->root->fs_info;
2861  		const u32 sectorsize = fs_info->sectorsize;
2862  		struct page *page;
2863  
2864  		ASSERT(sectorsize < PAGE_SIZE);
2865  		ASSERT(IS_ALIGNED(i_size, sectorsize));
2866  
2867  		/*
2868  		 * Subpage can't handle page with DIRTY but without UPTODATE
2869  		 * bit as it can lead to the following deadlock:
2870  		 *
2871  		 * btrfs_read_folio()
2872  		 * | Page already *locked*
2873  		 * |- btrfs_lock_and_flush_ordered_range()
2874  		 *    |- btrfs_start_ordered_extent()
2875  		 *       |- extent_write_cache_pages()
2876  		 *          |- lock_page()
2877  		 *             We try to lock the page we already hold.
2878  		 *
2879  		 * Here we just writeback the whole data reloc inode, so that
2880  		 * we will be ensured to have no dirty range in the page, and
2881  		 * are safe to clear the uptodate bits.
2882  		 *
2883  		 * This shouldn't cause too much overhead, as we need to write
2884  		 * the data back anyway.
2885  		 */
2886  		ret = filemap_write_and_wait(mapping);
2887  		if (ret < 0)
2888  			return ret;
2889  
2890  		clear_extent_bits(&inode->io_tree, i_size,
2891  				  round_up(i_size, PAGE_SIZE) - 1,
2892  				  EXTENT_UPTODATE);
2893  		page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
2894  		/*
2895  		 * If the page was freed we don't need to do anything, as we
2896  		 * will re-read the whole page anyway.
2897  		 */
2898  		if (page) {
2899  			btrfs_subpage_clear_uptodate(fs_info, page, i_size,
2900  					round_up(i_size, PAGE_SIZE) - i_size);
2901  			unlock_page(page);
2902  			put_page(page);
2903  		}
2904  	}
2905  
2906  	BUG_ON(cluster->start != cluster->boundary[0]);
2907  	ret = btrfs_alloc_data_chunk_ondemand(inode,
2908  					      prealloc_end + 1 - prealloc_start);
2909  	if (ret)
2910  		return ret;
2911  
2912  	btrfs_inode_lock(inode, 0);
2913  	for (nr = 0; nr < cluster->nr; nr++) {
2914  		struct extent_state *cached_state = NULL;
2915  
2916  		start = cluster->boundary[nr] - offset;
2917  		if (nr + 1 < cluster->nr)
2918  			end = cluster->boundary[nr + 1] - 1 - offset;
2919  		else
2920  			end = cluster->end - offset;
2921  
2922  		lock_extent(&inode->io_tree, start, end, &cached_state);
2923  		num_bytes = end + 1 - start;
2924  		ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
2925  						num_bytes, num_bytes,
2926  						end + 1, &alloc_hint);
2927  		cur_offset = end + 1;
2928  		unlock_extent(&inode->io_tree, start, end, &cached_state);
2929  		if (ret)
2930  			break;
2931  	}
2932  	btrfs_inode_unlock(inode, 0);
2933  
2934  	if (cur_offset < prealloc_end)
2935  		btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
2936  					       prealloc_end + 1 - cur_offset);
2937  	return ret;
2938  }
2939  
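/*
 * Insert a pinned extent map for the file range [start, end] pointing at
 * @block_start, so reads of the data reloc inode fetch the data that is
 * being relocated.
 */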
2940  static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
2941  				u64 start, u64 end, u64 block_start)
2942  {
2943  	struct extent_map *em;
2944  	struct extent_state *cached_state = NULL;
2945  	int ret = 0;
2946  
2947  	em = alloc_extent_map();
2948  	if (!em)
2949  		return -ENOMEM;
2950  
2951  	em->start = start;
2952  	em->len = end + 1 - start;
2953  	em->block_len = em->len;
2954  	em->block_start = block_start;
2955  	set_bit(EXTENT_FLAG_PINNED, &em->flags);
2956  
2957  	lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
2958  	ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
2959  	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
2960  	free_extent_map(em);
2961  
2962  	return ret;
2963  }
2964  
2965  /*
2966   * Allow error injection to test balance/relocation cancellation
2967   */
2968  noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
2969  {
2970  	return atomic_read(&fs_info->balance_cancel_req) ||
2971  		atomic_read(&fs_info->reloc_cancel_req) ||
2972  		fatal_signal_pending(current);
2973  }
2974  ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2975  
2976  static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
2977  				    int cluster_nr)
2978  {
2979  	/* Last extent, use cluster end directly */
2980  	if (cluster_nr >= cluster->nr - 1)
2981  		return cluster->end;
2982  
2983  	/* Use next boundary start */
2984  	return cluster->boundary[cluster_nr + 1] - 1;
2985  }
2986  
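/*
 * Relocate the data covered by one page: read the page in, reserve
 * delalloc space for the in-cluster ranges and mark them dirty, and set
 * EXTENT_BOUNDARY at each cluster boundary so the new extents keep the
 * layout of the old ones.
 */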
2987  static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
2988  			     struct file_extent_cluster *cluster,
2989  			     int *cluster_nr, unsigned long page_index)
2990  {
2991  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2992  	u64 offset = BTRFS_I(inode)->index_cnt;
2993  	const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
2994  	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2995  	struct page *page;
2996  	u64 page_start;
2997  	u64 page_end;
2998  	u64 cur;
2999  	int ret;
3000  
3001  	ASSERT(page_index <= last_index);
3002  	page = find_lock_page(inode->i_mapping, page_index);
3003  	if (!page) {
3004  		page_cache_sync_readahead(inode->i_mapping, ra, NULL,
3005  				page_index, last_index + 1 - page_index);
3006  		page = find_or_create_page(inode->i_mapping, page_index, mask);
3007  		if (!page)
3008  			return -ENOMEM;
3009  	}
3010  
3011  	if (PageReadahead(page))
3012  		page_cache_async_readahead(inode->i_mapping, ra, NULL,
3013  				page_folio(page), page_index,
3014  				last_index + 1 - page_index);
3015  
3016  	if (!PageUptodate(page)) {
3017  		btrfs_read_folio(NULL, page_folio(page));
3018  		lock_page(page);
3019  		if (!PageUptodate(page)) {
3020  			ret = -EIO;
3021  			goto release_page;
3022  		}
3023  	}
3024  
3025  	/*
3026  	 * We could have lost page private when we dropped the lock to read the
3027  	 * page above, make sure we set_page_extent_mapped here so we have any
3028  	 * of the subpage blocksize stuff we need in place.
3029  	 */
3030  	ret = set_page_extent_mapped(page);
3031  	if (ret < 0)
3032  		goto release_page;
3033  
3034  	page_start = page_offset(page);
3035  	page_end = page_start + PAGE_SIZE - 1;
3036  
3037  	/*
3038  	 * Start from the cluster boundary, as in the subpage case the
3039  	 * cluster can start inside the page.
3040  	 */
3041  	cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
3042  	while (cur <= page_end) {
3043  		struct extent_state *cached_state = NULL;
3044  		u64 extent_start = cluster->boundary[*cluster_nr] - offset;
3045  		u64 extent_end = get_cluster_boundary_end(cluster,
3046  						*cluster_nr) - offset;
3047  		u64 clamped_start = max(page_start, extent_start);
3048  		u64 clamped_end = min(page_end, extent_end);
3049  		u32 clamped_len = clamped_end + 1 - clamped_start;
3050  
3051  		/* Reserve metadata for this range */
3052  		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3053  						      clamped_len, clamped_len,
3054  						      false);
3055  		if (ret)
3056  			goto release_page;
3057  
3058  		/* Mark the range delalloc and dirty for later writeback */
3059  		lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
3060  			    &cached_state);
3061  		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
3062  						clamped_end, 0, &cached_state);
3063  		if (ret) {
3064  			clear_extent_bit(&BTRFS_I(inode)->io_tree,
3065  					 clamped_start, clamped_end,
3066  					 EXTENT_LOCKED | EXTENT_BOUNDARY,
3067  					 &cached_state);
3068  			btrfs_delalloc_release_metadata(BTRFS_I(inode),
3069  							clamped_len, true);
3070  			btrfs_delalloc_release_extents(BTRFS_I(inode),
3071  						       clamped_len);
3072  			goto release_page;
3073  		}
3074  		btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
3075  
3076  		/*
3077  		 * Set the boundary if it's inside the page.
3078  		 * Data relocation requires the destination extents to have the
3079  		 * same size as the source.
3080  		 * EXTENT_BOUNDARY bit prevents current extent from being merged
3081  		 * with previous extent.
3082  		 */
3083  		if (in_range(cluster->boundary[*cluster_nr] - offset,
3084  			     page_start, PAGE_SIZE)) {
3085  			u64 boundary_start = cluster->boundary[*cluster_nr] -
3086  						offset;
3087  			u64 boundary_end = boundary_start +
3088  					   fs_info->sectorsize - 1;
3089  
3090  			set_extent_bit(&BTRFS_I(inode)->io_tree,
3091  				       boundary_start, boundary_end,
3092  				       EXTENT_BOUNDARY, NULL);
3093  		}
3094  		unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
3095  			      &cached_state);
3096  		btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
3097  		cur += clamped_len;
3098  
3099  		/* Crossed extent end, go to next extent */
3100  		if (cur >= extent_end) {
3101  			(*cluster_nr)++;
3102  			/* Just finished the last extent of the cluster, exit. */
3103  			if (*cluster_nr >= cluster->nr)
3104  				break;
3105  		}
3106  	}
3107  	unlock_page(page);
3108  	put_page(page);
3109  
3110  	balance_dirty_pages_ratelimited(inode->i_mapping);
3111  	btrfs_throttle(fs_info);
3112  	if (btrfs_should_cancel_balance(fs_info))
3113  		ret = -ECANCELED;
3114  	return ret;
3115  
3116  release_page:
3117  	unlock_page(page);
3118  	put_page(page);
3119  	return ret;
3120  }
3121  
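/*
 * Relocate a whole extent cluster: preallocate the destination space, map
 * the file range to the old data location, then dirty all pages of the
 * cluster so writeback copies the data into the preallocated extents.
 */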
3122  static int relocate_file_extent_cluster(struct inode *inode,
3123  					struct file_extent_cluster *cluster)
3124  {
3125  	u64 offset = BTRFS_I(inode)->index_cnt;
3126  	unsigned long index;
3127  	unsigned long last_index;
3128  	struct file_ra_state *ra;
3129  	int cluster_nr = 0;
3130  	int ret = 0;
3131  
3132  	if (!cluster->nr)
3133  		return 0;
3134  
3135  	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3136  	if (!ra)
3137  		return -ENOMEM;
3138  
3139  	ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
3140  	if (ret)
3141  		goto out;
3142  
3143  	file_ra_state_init(ra, inode->i_mapping);
3144  
3145  	ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
3146  				   cluster->end - offset, cluster->start);
3147  	if (ret)
3148  		goto out;
3149  
3150  	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3151  	for (index = (cluster->start - offset) >> PAGE_SHIFT;
3152  	     index <= last_index && !ret; index++)
3153  		ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
3154  	if (ret == 0)
3155  		WARN_ON(cluster_nr != cluster->nr);
3156  out:
3157  	kfree(ra);
3158  	return ret;
3159  }
3160  
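/*
 * Queue one data extent into the current cluster.  The cluster is
 * relocated first if the new extent is not contiguous with it, or once
 * MAX_EXTENTS boundaries have been collected.
 */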
3161  static noinline_for_stack
3162  int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3163  			 struct file_extent_cluster *cluster)
3164  {
3165  	int ret;
3166  
3167  	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3168  		ret = relocate_file_extent_cluster(inode, cluster);
3169  		if (ret)
3170  			return ret;
3171  		cluster->nr = 0;
3172  	}
3173  
3174  	if (!cluster->nr)
3175  		cluster->start = extent_key->objectid;
3176  	else
3177  		BUG_ON(cluster->nr >= MAX_EXTENTS);
3178  	cluster->end = extent_key->objectid + extent_key->offset - 1;
3179  	cluster->boundary[cluster->nr] = extent_key->objectid;
3180  	cluster->nr++;
3181  
3182  	if (cluster->nr >= MAX_EXTENTS) {
3183  		ret = relocate_file_extent_cluster(inode, cluster);
3184  		if (ret)
3185  			return ret;
3186  		cluster->nr = 0;
3187  	}
3188  	return 0;
3189  }
3190  
3191  /*
3192   * helper to add a tree block to the list.
3193   * the major work is getting the generation and level of the block
3194   */
3195  static int add_tree_block(struct reloc_control *rc,
3196  			  struct btrfs_key *extent_key,
3197  			  struct btrfs_path *path,
3198  			  struct rb_root *blocks)
3199  {
3200  	struct extent_buffer *eb;
3201  	struct btrfs_extent_item *ei;
3202  	struct btrfs_tree_block_info *bi;
3203  	struct tree_block *block;
3204  	struct rb_node *rb_node;
3205  	u32 item_size;
3206  	int level = -1;
3207  	u64 generation;
3208  	u64 owner = 0;
3209  
3210  	eb =  path->nodes[0];
3211  	item_size = btrfs_item_size(eb, path->slots[0]);
3212  
3213  	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3214  	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3215  		unsigned long ptr = 0, end;
3216  
3217  		ei = btrfs_item_ptr(eb, path->slots[0],
3218  				struct btrfs_extent_item);
3219  		end = (unsigned long)ei + item_size;
3220  		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3221  			bi = (struct btrfs_tree_block_info *)(ei + 1);
3222  			level = btrfs_tree_block_level(eb, bi);
3223  			ptr = (unsigned long)(bi + 1);
3224  		} else {
3225  			level = (int)extent_key->offset;
3226  			ptr = (unsigned long)(ei + 1);
3227  		}
3228  		generation = btrfs_extent_generation(eb, ei);
3229  
3230  		/*
3231  		 * We're reading random blocks without knowing their owner ahead
3232  		 * of time.  This is ok most of the time, as all reloc roots and
3233  		 * fs roots have the same lock type.  However normal trees do
3234  		 * not, and the only way to know ahead of time is to read the
3235  		 * inline ref offset.  We know it's an fs root if
3236  		 *
3237  		 * 1. There's more than one ref.
3238  		 * 2. There's a SHARED_DATA_REF_KEY set.
3239  		 * 3. FULL_BACKREF is set on the flags.
3240  		 *
3241  		 * Otherwise it's safe to assume that the ref offset == the
3242  		 * owner of this block, so we can use that when calling
3243  		 * read_tree_block.
3244  		 */
3245  		if (btrfs_extent_refs(eb, ei) == 1 &&
3246  		    !(btrfs_extent_flags(eb, ei) &
3247  		      BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
3248  		    ptr < end) {
3249  			struct btrfs_extent_inline_ref *iref;
3250  			int type;
3251  
3252  			iref = (struct btrfs_extent_inline_ref *)ptr;
3253  			type = btrfs_get_extent_inline_ref_type(eb, iref,
3254  							BTRFS_REF_TYPE_BLOCK);
3255  			if (type == BTRFS_REF_TYPE_INVALID)
3256  				return -EINVAL;
3257  			if (type == BTRFS_TREE_BLOCK_REF_KEY)
3258  				owner = btrfs_extent_inline_ref_offset(eb, iref);
3259  		}
3260  	} else {
3261  		btrfs_print_leaf(eb);
3262  		btrfs_err(rc->block_group->fs_info,
3263  			  "unrecognized tree backref at tree block %llu slot %u",
3264  			  eb->start, path->slots[0]);
3265  		btrfs_release_path(path);
3266  		return -EUCLEAN;
3267  	}
3268  
3269  	btrfs_release_path(path);
3270  
3271  	BUG_ON(level == -1);
3272  
3273  	block = kmalloc(sizeof(*block), GFP_NOFS);
3274  	if (!block)
3275  		return -ENOMEM;
3276  
3277  	block->bytenr = extent_key->objectid;
3278  	block->key.objectid = rc->extent_root->fs_info->nodesize;
3279  	block->key.offset = generation;
3280  	block->level = level;
3281  	block->key_ready = 0;
3282  	block->owner = owner;
3283  
3284  	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
3285  	if (rb_node)
3286  		btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
3287  				    -EEXIST);
3288  
3289  	return 0;
3290  }
3291  
3292  /*
3293   * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3294   */
3295  static int __add_tree_block(struct reloc_control *rc,
3296  			    u64 bytenr, u32 blocksize,
3297  			    struct rb_root *blocks)
3298  {
3299  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3300  	struct btrfs_path *path;
3301  	struct btrfs_key key;
3302  	int ret;
3303  	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3304  
3305  	if (tree_block_processed(bytenr, rc))
3306  		return 0;
3307  
3308  	if (rb_simple_search(blocks, bytenr))
3309  		return 0;
3310  
3311  	path = btrfs_alloc_path();
3312  	if (!path)
3313  		return -ENOMEM;
3314  again:
3315  	key.objectid = bytenr;
3316  	if (skinny) {
3317  		key.type = BTRFS_METADATA_ITEM_KEY;
3318  		key.offset = (u64)-1;
3319  	} else {
3320  		key.type = BTRFS_EXTENT_ITEM_KEY;
3321  		key.offset = blocksize;
3322  	}
3323  
3324  	path->search_commit_root = 1;
3325  	path->skip_locking = 1;
3326  	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3327  	if (ret < 0)
3328  		goto out;
3329  
3330  	if (ret > 0 && skinny) {
3331  		if (path->slots[0]) {
3332  			path->slots[0]--;
3333  			btrfs_item_key_to_cpu(path->nodes[0], &key,
3334  					      path->slots[0]);
3335  			if (key.objectid == bytenr &&
3336  			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3337  			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3338  			      key.offset == blocksize)))
3339  				ret = 0;
3340  		}
3341  
3342  		if (ret) {
3343  			skinny = false;
3344  			btrfs_release_path(path);
3345  			goto again;
3346  		}
3347  	}
3348  	if (ret) {
3349  		ASSERT(ret == 1);
3350  		btrfs_print_leaf(path->nodes[0]);
3351  		btrfs_err(fs_info,
3352  	     "tree block extent item (%llu) is not found in extent tree",
3353  		     bytenr);
3354  		WARN_ON(1);
3355  		ret = -EINVAL;
3356  		goto out;
3357  	}
3358  
3359  	ret = add_tree_block(rc, &key, path, blocks);
3360  out:
3361  	btrfs_free_path(path);
3362  	return ret;
3363  }
3364  
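/*
 * Truncate the free space cache inode of @block_group (looked up by @ino
 * when @inode is NULL), dropping its cached contents.
 */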
3365  static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3366  				    struct btrfs_block_group *block_group,
3367  				    struct inode *inode,
3368  				    u64 ino)
3369  {
3370  	struct btrfs_root *root = fs_info->tree_root;
3371  	struct btrfs_trans_handle *trans;
3372  	int ret = 0;
3373  
3374  	if (inode)
3375  		goto truncate;
3376  
3377  	inode = btrfs_iget(fs_info->sb, ino, root);
3378  	if (IS_ERR(inode))
3379  		return -ENOENT;
3380  
3381  truncate:
3382  	ret = btrfs_check_trunc_cache_free_space(fs_info,
3383  						 &fs_info->global_block_rsv);
3384  	if (ret)
3385  		goto out;
3386  
3387  	trans = btrfs_join_transaction(root);
3388  	if (IS_ERR(trans)) {
3389  		ret = PTR_ERR(trans);
3390  		goto out;
3391  	}
3392  
3393  	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3394  
3395  	btrfs_end_transaction(trans);
3396  	btrfs_btree_balance_dirty(fs_info);
3397  out:
3398  	iput(inode);
3399  	return ret;
3400  }
3401  
3402  /*
3403   * Locate the free space cache EXTENT_DATA item in the root tree leaf and
3404   * delete the cache inode, so its data extent does not block data relocation.
3405   */
3406  static int delete_v1_space_cache(struct extent_buffer *leaf,
3407  				 struct btrfs_block_group *block_group,
3408  				 u64 data_bytenr)
3409  {
3410  	u64 space_cache_ino;
3411  	struct btrfs_file_extent_item *ei;
3412  	struct btrfs_key key;
3413  	bool found = false;
3414  	int i;
3415  	int ret;
3416  
3417  	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3418  		return 0;
3419  
3420  	for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3421  		u8 type;
3422  
3423  		btrfs_item_key_to_cpu(leaf, &key, i);
3424  		if (key.type != BTRFS_EXTENT_DATA_KEY)
3425  			continue;
3426  		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3427  		type = btrfs_file_extent_type(leaf, ei);
3428  
3429  		if ((type == BTRFS_FILE_EXTENT_REG ||
3430  		     type == BTRFS_FILE_EXTENT_PREALLOC) &&
3431  		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3432  			found = true;
3433  			space_cache_ino = key.objectid;
3434  			break;
3435  		}
3436  	}
3437  	if (!found)
3438  		return -ENOENT;
3439  	ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3440  					space_cache_ino);
3441  	return ret;
3442  }
3443  
3444  /*
3445   * helper to find all tree blocks that reference a given data extent
3446   */
3447  static noinline_for_stack
3448  int add_data_references(struct reloc_control *rc,
3449  			struct btrfs_key *extent_key,
3450  			struct btrfs_path *path,
3451  			struct rb_root *blocks)
3452  {
3453  	struct btrfs_backref_walk_ctx ctx = { 0 };
3454  	struct ulist_iterator leaf_uiter;
3455  	struct ulist_node *ref_node = NULL;
3456  	const u32 blocksize = rc->extent_root->fs_info->nodesize;
3457  	int ret = 0;
3458  
3459  	btrfs_release_path(path);
3460  
3461  	ctx.bytenr = extent_key->objectid;
3462  	ctx.skip_inode_ref_list = true;
3463  	ctx.fs_info = rc->extent_root->fs_info;
3464  
3465  	ret = btrfs_find_all_leafs(&ctx);
3466  	if (ret < 0)
3467  		return ret;
3468  
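	/*
	 * For each tree leaf that references the data extent: drop the v1
	 * space cache inode if the reference is from the root tree, then
	 * queue the leaf itself for relocation.
	 */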
3469  	ULIST_ITER_INIT(&leaf_uiter);
3470  	while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
3471  		struct btrfs_tree_parent_check check = { 0 };
3472  		struct extent_buffer *eb;
3473  
3474  		eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
3475  		if (IS_ERR(eb)) {
3476  			ret = PTR_ERR(eb);
3477  			break;
3478  		}
3479  		ret = delete_v1_space_cache(eb, rc->block_group,
3480  					    extent_key->objectid);
3481  		free_extent_buffer(eb);
3482  		if (ret < 0)
3483  			break;
3484  		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3485  		if (ret < 0)
3486  			break;
3487  	}
3488  	if (ret < 0)
3489  		free_block_list(blocks);
3490  	ulist_free(ctx.refs);
3491  	return ret;
3492  }
3493  
3494  /*
3495   * Helper to find the next unprocessed extent (0 found, 1 done, <0 error).
3496   */
3497  static noinline_for_stack
3498  int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3499  		     struct btrfs_key *extent_key)
3500  {
3501  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3502  	struct btrfs_key key;
3503  	struct extent_buffer *leaf;
3504  	u64 start, end, last;
3505  	int ret;
3506  
3507  	last = rc->block_group->start + rc->block_group->length;
3508  	while (1) {
3509  		bool block_found;
3510  
3511  		cond_resched();
3512  		if (rc->search_start >= last) {
3513  			ret = 1;
3514  			break;
3515  		}
3516  
3517  		key.objectid = rc->search_start;
3518  		key.type = BTRFS_EXTENT_ITEM_KEY;
3519  		key.offset = 0;
3520  
3521  		path->search_commit_root = 1;
3522  		path->skip_locking = 1;
3523  		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3524  					0, 0);
3525  		if (ret < 0)
3526  			break;
3527  next:
3528  		leaf = path->nodes[0];
3529  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3530  			ret = btrfs_next_leaf(rc->extent_root, path);
3531  			if (ret != 0)
3532  				break;
3533  			leaf = path->nodes[0];
3534  		}
3535  
3536  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3537  		if (key.objectid >= last) {
3538  			ret = 1;
3539  			break;
3540  		}
3541  
3542  		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3543  		    key.type != BTRFS_METADATA_ITEM_KEY) {
3544  			path->slots[0]++;
3545  			goto next;
3546  		}
3547  
3548  		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3549  		    key.objectid + key.offset <= rc->search_start) {
3550  			path->slots[0]++;
3551  			goto next;
3552  		}
3553  
3554  		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3555  		    key.objectid + fs_info->nodesize <=
3556  		    rc->search_start) {
3557  			path->slots[0]++;
3558  			goto next;
3559  		}
3560  
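		/*
		 * Extents already relocated in an earlier pass are recorded
		 * as EXTENT_DIRTY ranges in processed_blocks; skip past them.
		 */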
3561  		block_found = find_first_extent_bit(&rc->processed_blocks,
3562  						    key.objectid, &start, &end,
3563  						    EXTENT_DIRTY, NULL);
3564  
3565  		if (block_found && start <= key.objectid) {
3566  			btrfs_release_path(path);
3567  			rc->search_start = end + 1;
3568  		} else {
3569  			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3570  				rc->search_start = key.objectid + key.offset;
3571  			else
3572  				rc->search_start = key.objectid +
3573  					fs_info->nodesize;
3574  			memcpy(extent_key, &key, sizeof(key));
3575  			return 0;
3576  		}
3577  	}
3578  	btrfs_release_path(path);
3579  	return ret;
3580  }
3581  
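/* Publish the running relocation control so other code paths can see it. */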
3582  static void set_reloc_control(struct reloc_control *rc)
3583  {
3584  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3585  
3586  	mutex_lock(&fs_info->reloc_mutex);
3587  	fs_info->reloc_ctl = rc;
3588  	mutex_unlock(&fs_info->reloc_mutex);
3589  }
3590  
3591  static void unset_reloc_control(struct reloc_control *rc)
3592  {
3593  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3594  
3595  	mutex_lock(&fs_info->reloc_mutex);
3596  	fs_info->reloc_ctl = NULL;
3597  	mutex_unlock(&fs_info->reloc_mutex);
3598  }
3599  
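/*
 * Set up state for the relocation main loop: allocate and fill the
 * temporary block reservation, reset the per-pass counters, publish the
 * reloc control, and commit a transaction so that later searches of the
 * commit roots start from a consistent point.
 */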
3600  static noinline_for_stack
3601  int prepare_to_relocate(struct reloc_control *rc)
3602  {
3603  	struct btrfs_trans_handle *trans;
3604  	int ret;
3605  
3606  	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3607  					      BTRFS_BLOCK_RSV_TEMP);
3608  	if (!rc->block_rsv)
3609  		return -ENOMEM;
3610  
3611  	memset(&rc->cluster, 0, sizeof(rc->cluster));
3612  	rc->search_start = rc->block_group->start;
3613  	rc->extents_found = 0;
3614  	rc->nodes_relocated = 0;
3615  	rc->merging_rsv_size = 0;
3616  	rc->reserved_bytes = 0;
3617  	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3618  			      RELOCATION_RESERVED_NODES;
3619  	ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
3620  				     rc->block_rsv, rc->block_rsv->size,
3621  				     BTRFS_RESERVE_FLUSH_ALL);
3622  	if (ret)
3623  		return ret;
3624  
3625  	rc->create_reloc_tree = 1;
3626  	set_reloc_control(rc);
3627  
3628  	trans = btrfs_join_transaction(rc->extent_root);
3629  	if (IS_ERR(trans)) {
3630  		unset_reloc_control(rc);
3631  		/*
3632  		 * The extent tree is not a ref_cow tree, so there is no
3633  		 * reloc root to clean up.  Callers are responsible for
3634  		 * freeing the block rsv allocated above.
3635  		 */
3636  		return PTR_ERR(trans);
3637  	}
3638  
3639  	ret = btrfs_commit_transaction(trans);
3640  	if (ret)
3641  		unset_reloc_control(rc);
3642  
3643  	return ret;
3644  }
3645  
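/*
 * The relocation main loop: walk the extent tree within the target block
 * group, collect tree blocks (and, in the UPDATE_DATA_PTRS stage, tree
 * blocks referencing data extents) to process, relocate them in batches,
 * then merge the reloc trees back into their fs trees and clean up.
 */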
3646  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3647  {
3648  	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3649  	struct rb_root blocks = RB_ROOT;
3650  	struct btrfs_key key;
3651  	struct btrfs_trans_handle *trans = NULL;
3652  	struct btrfs_path *path;
3653  	struct btrfs_extent_item *ei;
3654  	u64 flags;
3655  	int ret;
3656  	int err = 0;
3657  	int progress = 0;
3658  
3659  	path = btrfs_alloc_path();
3660  	if (!path)
3661  		return -ENOMEM;
3662  	path->reada = READA_FORWARD;
3663  
3664  	ret = prepare_to_relocate(rc);
3665  	if (ret) {
3666  		err = ret;
3667  		goto out_free;
3668  	}
3669  
3670  	while (1) {
3671  		rc->reserved_bytes = 0;
3672  		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
3673  					     rc->block_rsv->size,
3674  					     BTRFS_RESERVE_FLUSH_ALL);
3675  		if (ret) {
3676  			err = ret;
3677  			break;
3678  		}
3679  		progress++;
3680  		trans = btrfs_start_transaction(rc->extent_root, 0);
3681  		if (IS_ERR(trans)) {
3682  			err = PTR_ERR(trans);
3683  			trans = NULL;
3684  			break;
3685  		}
3686  restart:
3687  		if (update_backref_cache(trans, &rc->backref_cache)) {
3688  			btrfs_end_transaction(trans);
3689  			trans = NULL;
3690  			continue;
3691  		}
3692  
3693  		ret = find_next_extent(rc, path, &key);
3694  		if (ret < 0)
3695  			err = ret;
3696  		if (ret != 0)
3697  			break;
3698  
3699  		rc->extents_found++;
3700  
3701  		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3702  				    struct btrfs_extent_item);
3703  		flags = btrfs_extent_flags(path->nodes[0], ei);
3704  
3705  		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3706  			ret = add_tree_block(rc, &key, path, &blocks);
3707  		} else if (rc->stage == UPDATE_DATA_PTRS &&
3708  			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
3709  			ret = add_data_references(rc, &key, path, &blocks);
3710  		} else {
3711  			btrfs_release_path(path);
3712  			ret = 0;
3713  		}
3714  		if (ret < 0) {
3715  			err = ret;
3716  			break;
3717  		}
3718  
3719  		if (!RB_EMPTY_ROOT(&blocks)) {
3720  			ret = relocate_tree_blocks(trans, rc, &blocks);
3721  			if (ret < 0) {
3722  				if (ret != -EAGAIN) {
3723  					err = ret;
3724  					break;
3725  				}
3726  				rc->extents_found--;
3727  				rc->search_start = key.objectid;
3728  			}
3729  		}
3730  
3731  		btrfs_end_transaction_throttle(trans);
3732  		btrfs_btree_balance_dirty(fs_info);
3733  		trans = NULL;
3734  
3735  		if (rc->stage == MOVE_DATA_EXTENTS &&
3736  		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3737  			rc->found_file_extent = 1;
3738  			ret = relocate_data_extent(rc->data_inode,
3739  						   &key, &rc->cluster);
3740  			if (ret < 0) {
3741  				err = ret;
3742  				break;
3743  			}
3744  		}
3745  		if (btrfs_should_cancel_balance(fs_info)) {
3746  			err = -ECANCELED;
3747  			break;
3748  		}
3749  	}
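	/*
	 * If we hit ENOSPC after making some progress, try to force
	 * allocation of a new chunk of the same type and restart the scan
	 * from the last search position.
	 */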
3750  	if (trans && progress && err == -ENOSPC) {
3751  		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
3752  		if (ret == 1) {
3753  			err = 0;
3754  			progress = 0;
3755  			goto restart;
3756  		}
3757  	}
3758  
3759  	btrfs_release_path(path);
3760  	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
3761  
3762  	if (trans) {
3763  		btrfs_end_transaction_throttle(trans);
3764  		btrfs_btree_balance_dirty(fs_info);
3765  	}
3766  
3767  	if (!err) {
3768  		ret = relocate_file_extent_cluster(rc->data_inode,
3769  						   &rc->cluster);
3770  		if (ret < 0)
3771  			err = ret;
3772  	}
3773  
3774  	rc->create_reloc_tree = 0;
3775  	set_reloc_control(rc);
3776  
3777  	btrfs_backref_release_cache(&rc->backref_cache);
3778  	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3779  
3780  	/*
3781  	 * Even when the relocation is cancelled, we should still go through
3782  	 * prepare_to_merge() and merge_reloc_roots().
3783  	 *
3784  	 * On error (including a cancelled balance), prepare_to_merge() will
3785  	 * mark all reloc trees orphan, then queue them for cleanup in
3786  	 * merge_reloc_roots().
3787  	 */
3788  	err = prepare_to_merge(rc, err);
3789  
3790  	merge_reloc_roots(rc);
3791  
3792  	rc->merge_reloc_tree = 0;
3793  	unset_reloc_control(rc);
3794  	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3795  
3796  	/* get rid of pinned extents */
3797  	trans = btrfs_join_transaction(rc->extent_root);
3798  	if (IS_ERR(trans)) {
3799  		err = PTR_ERR(trans);
3800  		goto out_free;
3801  	}
3802  	ret = btrfs_commit_transaction(trans);
3803  	if (ret && !err)
3804  		err = ret;
3805  out_free:
3806  	ret = clean_dirty_subvols(rc);
3807  	if (ret < 0 && !err)
3808  		err = ret;
3809  	btrfs_free_block_rsv(fs_info, rc->block_rsv);
3810  	btrfs_free_path(path);
3811  	return err;
3812  }
3813  
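/*
 * Insert a bare inode item for the relocation inode: a regular 0600 file
 * with size 0 and the NOCOMPRESS and PREALLOC flags set.  The link count
 * stays zero, so the inode exists only as an orphan.
 */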
3814  static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3815  				 struct btrfs_root *root, u64 objectid)
3816  {
3817  	struct btrfs_path *path;
3818  	struct btrfs_inode_item *item;
3819  	struct extent_buffer *leaf;
3820  	int ret;
3821  
3822  	path = btrfs_alloc_path();
3823  	if (!path)
3824  		return -ENOMEM;
3825  
3826  	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3827  	if (ret)
3828  		goto out;
3829  
3830  	leaf = path->nodes[0];
3831  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3832  	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3833  	btrfs_set_inode_generation(leaf, item, 1);
3834  	btrfs_set_inode_size(leaf, item, 0);
3835  	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3836  	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3837  					  BTRFS_INODE_PREALLOC);
3838  	btrfs_mark_buffer_dirty(leaf);
3839  out:
3840  	btrfs_free_path(path);
3841  	return ret;
3842  }
3843  
3844  static void delete_orphan_inode(struct btrfs_trans_handle *trans,
3845  				struct btrfs_root *root, u64 objectid)
3846  {
3847  	struct btrfs_path *path;
3848  	struct btrfs_key key;
3849  	int ret = 0;
3850  
3851  	path = btrfs_alloc_path();
3852  	if (!path) {
3853  		ret = -ENOMEM;
3854  		goto out;
3855  	}
3856  
3857  	key.objectid = objectid;
3858  	key.type = BTRFS_INODE_ITEM_KEY;
3859  	key.offset = 0;
3860  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3861  	if (ret) {
3862  		if (ret > 0)
3863  			ret = -ENOENT;
3864  		goto out;
3865  	}
3866  	ret = btrfs_del_item(trans, root, path);
3867  out:
3868  	if (ret)
3869  		btrfs_abort_transaction(trans, ret);
3870  	btrfs_free_path(path);
3871  }
3872  
3873  /*
3874   * Helper to create the inode used for data relocation.
3875   * The inode lives in the data relocation tree and its link count is 0.
3876   */
3877  static noinline_for_stack
3878  struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3879  				 struct btrfs_block_group *group)
3880  {
3881  	struct inode *inode = NULL;
3882  	struct btrfs_trans_handle *trans;
3883  	struct btrfs_root *root;
3884  	u64 objectid;
3885  	int err = 0;
3886  
3887  	root = btrfs_grab_root(fs_info->data_reloc_root);
3888  	trans = btrfs_start_transaction(root, 6);
3889  	if (IS_ERR(trans)) {
3890  		btrfs_put_root(root);
3891  		return ERR_CAST(trans);
3892  	}
3893  
3894  	err = btrfs_get_free_objectid(root, &objectid);
3895  	if (err)
3896  		goto out;
3897  
3898  	err = __insert_orphan_inode(trans, root, objectid);
3899  	if (err)
3900  		goto out;
3901  
3902  	inode = btrfs_iget(fs_info->sb, objectid, root);
3903  	if (IS_ERR(inode)) {
3904  		delete_orphan_inode(trans, root, objectid);
3905  		err = PTR_ERR(inode);
3906  		inode = NULL;
3907  		goto out;
3908  	}
3909  	BTRFS_I(inode)->index_cnt = group->start;
3910  
3911  	err = btrfs_orphan_add(trans, BTRFS_I(inode));
3912  out:
3913  	btrfs_put_root(root);
3914  	btrfs_end_transaction(trans);
3915  	btrfs_btree_balance_dirty(fs_info);
3916  	if (err) {
3917  		iput(inode);
3918  		inode = ERR_PTR(err);
3919  	}
3920  	return inode;
3921  }
3922  
3923  /*
3924   * Mark start of chunk relocation that is cancellable. Check if the cancellation
3925   * has been requested meanwhile and don't start in that case.
3926   *
3927   * Return:
3928   *   0             success
3929   *   -EINPROGRESS  operation is already in progress, that's probably a bug
3930   *   -ECANCELED    cancellation request was set before the operation started
3931   */
3932  static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
3933  {
3934  	if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
3935  		/* This should not happen */
3936  		btrfs_err(fs_info, "reloc already running, cannot start");
3937  		return -EINPROGRESS;
3938  	}
3939  
3940  	if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
3941  		btrfs_info(fs_info, "chunk relocation canceled on start");
3942  		/*
3943  		 * On cancel, clear all requests but let the caller mark
3944  		 * the end after cleanup operations.
3945  		 */
3946  		atomic_set(&fs_info->reloc_cancel_req, 0);
3947  		return -ECANCELED;
3948  	}
3949  	return 0;
3950  }
3951  
3952  /*
3953   * Mark end of chunk relocation that is cancellable and wake any waiters.
3954   */
3955  static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
3956  {
3957  	/* Requested after start, clear bit first so any waiters can continue */
3958  	if (atomic_read(&fs_info->reloc_cancel_req) > 0)
3959  		btrfs_info(fs_info, "chunk relocation canceled during operation");
3960  	clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
3961  	atomic_set(&fs_info->reloc_cancel_req, 0);
3962  }
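
/*
 * Illustrative pairing (a sketch mirroring the callers in this file, not a
 * new API): the start/end markers bracket the cancellable operation, and
 * the end marker is reached through the common exit path even on failure:
 *
 *	ret = reloc_chunk_start(fs_info);
 *	if (ret < 0)
 *		goto out;
 *	... cancellable relocation work ...
 * out:
 *	reloc_chunk_end(fs_info);
 */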
3963  
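/*
 * Allocate a reloc_control and initialize the lists, trees and caches
 * embedded in it.  Returns NULL if the allocation fails.
 */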
3964  static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3965  {
3966  	struct reloc_control *rc;
3967  
3968  	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3969  	if (!rc)
3970  		return NULL;
3971  
3972  	INIT_LIST_HEAD(&rc->reloc_roots);
3973  	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
3974  	btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
3975  	mapping_tree_init(&rc->reloc_root_tree);
3976  	extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
3977  	return rc;
3978  }
3979  
3980  static void free_reloc_control(struct reloc_control *rc)
3981  {
3982  	struct mapping_node *node, *tmp;
3983  
3984  	free_reloc_roots(&rc->reloc_roots);
3985  	rbtree_postorder_for_each_entry_safe(node, tmp,
3986  			&rc->reloc_root_tree.rb_root, rb_node)
3987  		kfree(node);
3988  
3989  	kfree(rc);
3990  }
3991  
3992  /*
3993   * Print the block group being relocated
3994   */
3995  static void describe_relocation(struct btrfs_fs_info *fs_info,
3996  				struct btrfs_block_group *block_group)
3997  {
3998  	char buf[128] = {'\0'};
3999  
4000  	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
4001  
4002  	btrfs_info(fs_info,
4003  		   "relocating block group %llu flags %s",
4004  		   block_group->start, buf);
4005  }
4006  
4007  static const char *stage_to_string(int stage)
4008  {
4009  	if (stage == MOVE_DATA_EXTENTS)
4010  		return "move data extents";
4011  	if (stage == UPDATE_DATA_PTRS)
4012  		return "update data pointers";
4013  	return "unknown";
4014  }
4015  
4016  /*
4017   * Relocate all extents in the block group that starts at @group_start.
4018   */
4019  int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4020  {
4021  	struct btrfs_block_group *bg;
4022  	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
4023  	struct reloc_control *rc;
4024  	struct inode *inode;
4025  	struct btrfs_path *path;
4026  	int ret;
4027  	int rw = 0;
4028  	int err = 0;
4029  
4030  	/*
4031  	 * This only gets set if we had a half-deleted snapshot on mount.  We
4032  	 * cannot allow relocation to start while we're still trying to clean up
4033  	 * these pending deletions.
4034  	 */
4035  	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
4036  	if (ret)
4037  		return ret;
4038  
4039  	/* We may have been woken up by close_ctree, so bail if we're closing. */
4040  	if (btrfs_fs_closing(fs_info))
4041  		return -EINTR;
4042  
4043  	bg = btrfs_lookup_block_group(fs_info, group_start);
4044  	if (!bg)
4045  		return -ENOENT;
4046  
4047  	/*
4048  	 * Relocation of a data block group creates ordered extents.  Without
4049  	 * sb_start_write(), we can freeze the filesystem while unfinished
4050  	 * ordered extents are left. Such ordered extents can cause a deadlock
4051  	 * e.g. when syncfs() is waiting for their completion but they can't
4052  	 * finish because they block when joining a transaction, due to the
4053  	 * fact that the freeze locks are being held in write mode.
4054  	 */
4055  	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
4056  		ASSERT(sb_write_started(fs_info->sb));
4057  
4058  	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4059  		btrfs_put_block_group(bg);
4060  		return -ETXTBSY;
4061  	}
4062  
4063  	rc = alloc_reloc_control(fs_info);
4064  	if (!rc) {
4065  		btrfs_put_block_group(bg);
4066  		return -ENOMEM;
4067  	}
4068  
4069  	ret = reloc_chunk_start(fs_info);
4070  	if (ret < 0) {
4071  		err = ret;
4072  		goto out_put_bg;
4073  	}
4074  
4075  	rc->extent_root = extent_root;
4076  	rc->block_group = bg;
4077  
4078  	ret = btrfs_inc_block_group_ro(rc->block_group, true);
4079  	if (ret) {
4080  		err = ret;
4081  		goto out;
4082  	}
4083  	rw = 1;
4084  
4085  	path = btrfs_alloc_path();
4086  	if (!path) {
4087  		err = -ENOMEM;
4088  		goto out;
4089  	}
4090  
4091  	inode = lookup_free_space_inode(rc->block_group, path);
4092  	btrfs_free_path(path);
4093  
4094  	if (!IS_ERR(inode))
4095  		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4096  	else
4097  		ret = PTR_ERR(inode);
4098  
4099  	if (ret && ret != -ENOENT) {
4100  		err = ret;
4101  		goto out;
4102  	}
4103  
4104  	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4105  	if (IS_ERR(rc->data_inode)) {
4106  		err = PTR_ERR(rc->data_inode);
4107  		rc->data_inode = NULL;
4108  		goto out;
4109  	}
4110  
4111  	describe_relocation(fs_info, rc->block_group);
4112  
4113  	btrfs_wait_block_group_reservations(rc->block_group);
4114  	btrfs_wait_nocow_writers(rc->block_group);
4115  	btrfs_wait_ordered_roots(fs_info, U64_MAX,
4116  				 rc->block_group->start,
4117  				 rc->block_group->length);
4118  
4119  	ret = btrfs_zone_finish(rc->block_group);
4120  	WARN_ON(ret && ret != -EAGAIN);
4121  
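	/*
	 * Relocation runs in up to two stages per pass: MOVE_DATA_EXTENTS
	 * copies the data to its new location, then UPDATE_DATA_PTRS rewrites
	 * the file extent items to point at it.  Loop until a pass finds no
	 * more extents.
	 */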
4122  	while (1) {
4123  		int finishes_stage;
4124  
4125  		mutex_lock(&fs_info->cleaner_mutex);
4126  		ret = relocate_block_group(rc);
4127  		mutex_unlock(&fs_info->cleaner_mutex);
4128  		if (ret < 0)
4129  			err = ret;
4130  
4131  		finishes_stage = rc->stage;
4132  		/*
4133  		 * We may have gotten ENOSPC after we already dirtied some
4134  		 * extents.  If writeout happens while we're relocating a
4135  		 * different block group we could end up hitting the
4136  		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4137  		 * btrfs_reloc_cow_block.  Make sure we write everything out
4138  		 * properly so we don't trip over this problem, and then break
4139  		 * out of the loop if we hit an error.
4140  		 */
4141  		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4142  			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4143  						       (u64)-1);
4144  			if (ret)
4145  				err = ret;
4146  			invalidate_mapping_pages(rc->data_inode->i_mapping,
4147  						 0, -1);
4148  			rc->stage = UPDATE_DATA_PTRS;
4149  		}
4150  
4151  		if (err < 0)
4152  			goto out;
4153  
4154  		if (rc->extents_found == 0)
4155  			break;
4156  
4157  		btrfs_info(fs_info, "found %llu extents, stage: %s",
4158  			   rc->extents_found, stage_to_string(finishes_stage));
4159  	}
4160  
4161  	WARN_ON(rc->block_group->pinned > 0);
4162  	WARN_ON(rc->block_group->reserved > 0);
4163  	WARN_ON(rc->block_group->used > 0);
4164  out:
4165  	if (err && rw)
4166  		btrfs_dec_block_group_ro(rc->block_group);
4167  	iput(rc->data_inode);
4168  out_put_bg:
4169  	btrfs_put_block_group(bg);
4170  	reloc_chunk_end(fs_info);
4171  	free_reloc_control(rc);
4172  	return err;
4173  }
4174  
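/*
 * Mark a leftover reloc root as garbage on disk: reset its drop progress
 * and zero its refs so the recovery pass queues it for deletion instead of
 * merging it.
 */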
4175  static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4176  {
4177  	struct btrfs_fs_info *fs_info = root->fs_info;
4178  	struct btrfs_trans_handle *trans;
4179  	int ret, err;
4180  
4181  	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4182  	if (IS_ERR(trans))
4183  		return PTR_ERR(trans);
4184  
4185  	memset(&root->root_item.drop_progress, 0,
4186  		sizeof(root->root_item.drop_progress));
4187  	btrfs_set_root_drop_level(&root->root_item, 0);
4188  	btrfs_set_root_refs(&root->root_item, 0);
4189  	ret = btrfs_update_root(trans, fs_info->tree_root,
4190  				&root->root_key, &root->root_item);
4191  
4192  	err = btrfs_end_transaction(trans);
4193  	if (err)
4194  		return err;
4195  	return ret;
4196  }
4197  
4198  /*
4199   * Recover a relocation interrupted by a system crash.
4200   *
4201   * This function resumes merging reloc trees with their corresponding fs
4202   * trees, which is important for preserving the sharing of tree blocks.
4203   */
4204  int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
4205  {
4206  	LIST_HEAD(reloc_roots);
4207  	struct btrfs_key key;
4208  	struct btrfs_root *fs_root;
4209  	struct btrfs_root *reloc_root;
4210  	struct btrfs_path *path;
4211  	struct extent_buffer *leaf;
4212  	struct reloc_control *rc = NULL;
4213  	struct btrfs_trans_handle *trans;
4214  	int ret;
4215  	int err = 0;
4216  
4217  	path = btrfs_alloc_path();
4218  	if (!path)
4219  		return -ENOMEM;
4220  	path->reada = READA_BACK;
4221  
4222  	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4223  	key.type = BTRFS_ROOT_ITEM_KEY;
4224  	key.offset = (u64)-1;
4225  
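	/*
	 * Walk all BTRFS_TREE_RELOC_OBJECTID root items backwards and read
	 * every reloc root left over from the crash.  A reloc root whose fs
	 * root no longer exists is marked as garbage.
	 */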
4226  	while (1) {
4227  		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4228  					path, 0, 0);
4229  		if (ret < 0) {
4230  			err = ret;
4231  			goto out;
4232  		}
4233  		if (ret > 0) {
4234  			if (path->slots[0] == 0)
4235  				break;
4236  			path->slots[0]--;
4237  		}
4238  		leaf = path->nodes[0];
4239  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4240  		btrfs_release_path(path);
4241  
4242  		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4243  		    key.type != BTRFS_ROOT_ITEM_KEY)
4244  			break;
4245  
4246  		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
4247  		if (IS_ERR(reloc_root)) {
4248  			err = PTR_ERR(reloc_root);
4249  			goto out;
4250  		}
4251  
4252  		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
4253  		list_add(&reloc_root->root_list, &reloc_roots);
4254  
4255  		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4256  			fs_root = btrfs_get_fs_root(fs_info,
4257  					reloc_root->root_key.offset, false);
4258  			if (IS_ERR(fs_root)) {
4259  				ret = PTR_ERR(fs_root);
4260  				if (ret != -ENOENT) {
4261  					err = ret;
4262  					goto out;
4263  				}
4264  				ret = mark_garbage_root(reloc_root);
4265  				if (ret < 0) {
4266  					err = ret;
4267  					goto out;
4268  				}
4269  			} else {
4270  				btrfs_put_root(fs_root);
4271  			}
4272  		}
4273  
4274  		if (key.offset == 0)
4275  			break;
4276  
4277  		key.offset--;
4278  	}
4279  	btrfs_release_path(path);
4280  
4281  	if (list_empty(&reloc_roots))
4282  		goto out;
4283  
4284  	rc = alloc_reloc_control(fs_info);
4285  	if (!rc) {
4286  		err = -ENOMEM;
4287  		goto out;
4288  	}
4289  
4290  	ret = reloc_chunk_start(fs_info);
4291  	if (ret < 0) {
4292  		err = ret;
4293  		goto out_end;
4294  	}
4295  
4296  	rc->extent_root = btrfs_extent_root(fs_info, 0);
4297  
4298  	set_reloc_control(rc);
4299  
4300  	trans = btrfs_join_transaction(rc->extent_root);
4301  	if (IS_ERR(trans)) {
4302  		err = PTR_ERR(trans);
4303  		goto out_unset;
4304  	}
4305  
4306  	rc->merge_reloc_tree = 1;
4307  
4308  	while (!list_empty(&reloc_roots)) {
4309  		reloc_root = list_entry(reloc_roots.next,
4310  					struct btrfs_root, root_list);
4311  		list_del(&reloc_root->root_list);
4312  
4313  		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4314  			list_add_tail(&reloc_root->root_list,
4315  				      &rc->reloc_roots);
4316  			continue;
4317  		}
4318  
4319  		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
4320  					    false);
4321  		if (IS_ERR(fs_root)) {
4322  			err = PTR_ERR(fs_root);
4323  			list_add_tail(&reloc_root->root_list, &reloc_roots);
4324  			btrfs_end_transaction(trans);
4325  			goto out_unset;
4326  		}
4327  
4328  		err = __add_reloc_root(reloc_root);
4329  		ASSERT(err != -EEXIST);
4330  		if (err) {
4331  			list_add_tail(&reloc_root->root_list, &reloc_roots);
4332  			btrfs_put_root(fs_root);
4333  			btrfs_end_transaction(trans);
4334  			goto out_unset;
4335  		}
4336  		fs_root->reloc_root = btrfs_grab_root(reloc_root);
4337  		btrfs_put_root(fs_root);
4338  	}
4339  
4340  	err = btrfs_commit_transaction(trans);
4341  	if (err)
4342  		goto out_unset;
4343  
4344  	merge_reloc_roots(rc);
4345  
4346  	unset_reloc_control(rc);
4347  
4348  	trans = btrfs_join_transaction(rc->extent_root);
4349  	if (IS_ERR(trans)) {
4350  		err = PTR_ERR(trans);
4351  		goto out_clean;
4352  	}
4353  	err = btrfs_commit_transaction(trans);
4354  out_clean:
4355  	ret = clean_dirty_subvols(rc);
4356  	if (ret < 0 && !err)
4357  		err = ret;
4358  out_unset:
4359  	unset_reloc_control(rc);
4360  out_end:
4361  	reloc_chunk_end(fs_info);
4362  	free_reloc_control(rc);
4363  out:
4364  	free_reloc_roots(&reloc_roots);
4365  
4366  	btrfs_free_path(path);
4367  
4368  	if (err == 0) {
4369  		/* cleanup orphan inode in data relocation tree */
4370  		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
4371  		ASSERT(fs_root);
4372  		err = btrfs_orphan_cleanup(fs_root);
4373  		btrfs_put_root(fs_root);
4374  	}
4375  	return err;
4376  }
4377  
4378  /*
4379   * Helper to add ordered checksums for data relocation.
4380   *
4381   * Cloning the existing checksums properly handles nodatasum extents and
4382   * saves the CPU time of recalculating them.
4383   */
4384  int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
4385  {
4386  	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
4387  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4388  	u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
4389  	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
4390  	LIST_HEAD(list);
4391  	int ret;
4392  
4393  	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4394  				      disk_bytenr + ordered->num_bytes - 1,
4395  				      &list, 0, false);
4396  	if (ret)
4397  		return ret;
4398  
4399  	while (!list_empty(&list)) {
4400  		struct btrfs_ordered_sum *sums =
4401  			list_entry(list.next, struct btrfs_ordered_sum, list);
4402  
4403  		list_del_init(&sums->list);
4404  
4405  		/*
4406  		 * We need to offset the new bytenr based on where the csum is.
4407  		 * We do this because we read in entire prealloc extents, but
4408  		 * we may have written to, say, the middle of the prealloc
4409  		 * extent, so we must make sure each csum goes with the right
4410  		 * disk offset.
4411  		 *
4412  		 * We can do this because the data reloc inode refers strictly
4413  		 * to the on-disk bytes, so we don't have to worry about
4414  		 * disk_len vs. real length as with regular inodes, since it
4415  		 * is all disk length.
4416  		 */
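		/*
		 * Worked example (hypothetical numbers): if the data was read
		 * from old disk_bytenr 1M, relocated to 16M, and this csum
		 * covers the bytes at old logical 1M + 64K, the new logical
		 * becomes 16M + (1M + 64K) - 1M = 16M + 64K.
		 */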
4417  		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
4418  		btrfs_add_ordered_sum(ordered, sums);
4419  	}
4420  
4421  	return 0;
4422  }
4423  
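/*
 * Hook called whenever a tree block is COWed while relocation is running.
 * For reloc tree blocks, keep the backref cache node pointing at the new
 * copy; on the first COW of a leaf during the UPDATE_DATA_PTRS stage,
 * rewrite its file extent pointers to the relocated data extents.
 */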
4424  int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4425  			  struct btrfs_root *root, struct extent_buffer *buf,
4426  			  struct extent_buffer *cow)
4427  {
4428  	struct btrfs_fs_info *fs_info = root->fs_info;
4429  	struct reloc_control *rc;
4430  	struct btrfs_backref_node *node;
4431  	int first_cow = 0;
4432  	int level;
4433  	int ret = 0;
4434  
4435  	rc = fs_info->reloc_ctl;
4436  	if (!rc)
4437  		return 0;
4438  
4439  	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
4440  
4441  	level = btrfs_header_level(buf);
4442  	if (btrfs_header_generation(buf) <=
4443  	    btrfs_root_last_snapshot(&root->root_item))
4444  		first_cow = 1;
4445  
4446  	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4447  	    rc->create_reloc_tree) {
4448  		WARN_ON(!first_cow && level == 0);
4449  
4450  		node = rc->backref_cache.path[level];
4451  		BUG_ON(node->bytenr != buf->start &&
4452  		       node->new_bytenr != buf->start);
4453  
4454  		btrfs_backref_drop_node_buffer(node);
4455  		atomic_inc(&cow->refs);
4456  		node->eb = cow;
4457  		node->new_bytenr = cow->start;
4458  
4459  		if (!node->pending) {
4460  			list_move_tail(&node->list,
4461  				       &rc->backref_cache.pending[level]);
4462  			node->pending = 1;
4463  		}
4464  
4465  		if (first_cow)
4466  			mark_block_processed(rc, node);
4467  
4468  		if (first_cow && level > 0)
4469  			rc->nodes_relocated += buf->len;
4470  	}
4471  
4472  	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4473  		ret = replace_file_extents(trans, rc, root, cow);
4474  	return ret;
4475  }
4476  
4477  /*
4478   * called before creating snapshot. it calculates metadata reservation
4479   * required for relocating tree blocks in the snapshot
4480   */
4481  void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4482  			      u64 *bytes_to_reserve)
4483  {
4484  	struct btrfs_root *root = pending->root;
4485  	struct reloc_control *rc = root->fs_info->reloc_ctl;
4486  
4487  	if (!rc || !have_reloc_root(root))
4488  		return;
4489  
4490  	if (!rc->merge_reloc_tree)
4491  		return;
4492  
4493  	root = root->reloc_root;
4494  	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4495  	/*
4496  	 * Relocation is in the stage of merging trees.  In the worst
4497  	 * case, the space used by merging a reloc tree is twice the
4498  	 * size of the relocated tree nodes: half for COWing the reloc
4499  	 * tree, half for COWing the fs tree.  The space used by COWing
4500  	 * the reloc tree is freed after the tree is dropped.  However,
4501  	 * if we create a snapshot, COWing the fs tree may use more
4502  	 * space than it frees, so we need to reserve extra space for
4503  	 * it.
4504  	 */
4505  	*bytes_to_reserve += rc->nodes_relocated;
4506  }
4507  
4508  /*
4509   * called after snapshot is created. migrate block reservation
4510   * and create reloc root for the newly created snapshot
4511   *
4512   * This is similar to btrfs_init_reloc_root(), we come out of here with two
4513   * references held on the reloc_root, one for root->reloc_root and one for
4514   * rc->reloc_roots.
4515   */
4516  int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4517  			       struct btrfs_pending_snapshot *pending)
4518  {
4519  	struct btrfs_root *root = pending->root;
4520  	struct btrfs_root *reloc_root;
4521  	struct btrfs_root *new_root;
4522  	struct reloc_control *rc = root->fs_info->reloc_ctl;
4523  	int ret;
4524  
4525  	if (!rc || !have_reloc_root(root))
4526  		return 0;
4527  
4529  	rc->merging_rsv_size += rc->nodes_relocated;
4530  
4531  	if (rc->merge_reloc_tree) {
4532  		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4533  					      rc->block_rsv,
4534  					      rc->nodes_relocated, true);
4535  		if (ret)
4536  			return ret;
4537  	}
4538  
4539  	new_root = pending->snap;
4540  	reloc_root = create_reloc_root(trans, root->reloc_root,
4541  				       new_root->root_key.objectid);
4542  	if (IS_ERR(reloc_root))
4543  		return PTR_ERR(reloc_root);
4544  
4545  	ret = __add_reloc_root(reloc_root);
4546  	ASSERT(ret != -EEXIST);
4547  	if (ret) {
4548  		/* Pairs with create_reloc_root */
4549  		btrfs_put_root(reloc_root);
4550  		return ret;
4551  	}
4552  	new_root->reloc_root = btrfs_grab_root(reloc_root);
4553  
4554  	if (rc->create_reloc_tree)
4555  		ret = clone_backref_node(trans, rc, root, reloc_root);
4556  	return ret;
4557  }
4558  
4559  /*
4560   * Get the current bytenr for the block group which is being relocated.
4561   *
4562   * Return U64_MAX if no running relocation.
4563   */
4564  u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
4565  {
4566  	u64 logical = U64_MAX;
4567  
4568  	lockdep_assert_held(&fs_info->reloc_mutex);
4569  
4570  	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
4571  		logical = fs_info->reloc_ctl->block_group->start;
4572  	return logical;
4573  }
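
/*
 * Example caller pattern (sketch): the returned bytenr is only stable
 * while reloc_mutex is held, so callers query it under that mutex:
 *
 *	mutex_lock(&fs_info->reloc_mutex);
 *	bytenr = btrfs_get_reloc_bg_bytenr(fs_info);
 *	mutex_unlock(&fs_info->reloc_mutex);
 */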
4574