1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/fs.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/prefetch.h>
16 #include <linux/blkdev.h>
17 #include <linux/rbtree.h>
18 #include <linux/random.h>
19 
20 #include "gfs2.h"
21 #include "incore.h"
22 #include "glock.h"
23 #include "glops.h"
24 #include "lops.h"
25 #include "meta_io.h"
26 #include "quota.h"
27 #include "rgrp.h"
28 #include "super.h"
29 #include "trans.h"
30 #include "util.h"
31 #include "log.h"
32 #include "inode.h"
33 #include "trace_gfs2.h"
34 #include "dir.h"
35 
36 #define BFITNOENT ((u32)~0)
37 #define NO_BLOCK ((u64)~0)
38 
39 #if BITS_PER_LONG == 32
40 #define LBITMASK   (0x55555555UL)
41 #define LBITSKIP55 (0x55555555UL)
42 #define LBITSKIP00 (0x00000000UL)
43 #else
44 #define LBITMASK   (0x5555555555555555UL)
45 #define LBITSKIP55 (0x5555555555555555UL)
46 #define LBITSKIP00 (0x0000000000000000UL)
47 #endif
48 
49 /*
50  * These routines are used by the resource group routines (rgrp.c)
51  * to keep track of block allocation.  Each block is represented by two
52  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
53  *
54  * 0 = Free
55  * 1 = Used (not metadata)
56  * 2 = Unlinked (still in use) inode
57  * 3 = Used (metadata)
58  */
59 
60 struct gfs2_extent {
61 	struct gfs2_rbm rbm;
62 	u32 len;
63 };
64 
65 static const char valid_change[16] = {
66 	        /* current */
67 	/* n */ 0, 1, 1, 1,
68 	/* e */ 1, 0, 0, 0,
69 	/* w */ 0, 0, 0, 1,
70 	        1, 0, 0, 0
71 };
72 
73 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
74 			 const struct gfs2_inode *ip, bool nowrap);
75 
76 
77 /**
78  * gfs2_setbit - Set a bit in the bitmaps
79  * @rbm: The position of the bit to set
80  * @do_clone: Also set the clone bitmap, if it exists
81  * @new_state: the new state of the block
82  *
83  */
84 
gfs2_setbit(const struct gfs2_rbm * rbm,bool do_clone,unsigned char new_state)85 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
86 			       unsigned char new_state)
87 {
88 	unsigned char *byte1, *byte2, *end, cur_state;
89 	struct gfs2_bitmap *bi = rbm_bi(rbm);
90 	unsigned int buflen = bi->bi_bytes;
91 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
92 
93 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
94 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
95 
96 	BUG_ON(byte1 >= end);
97 
98 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
99 
100 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
101 		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
102 
103 		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
104 			rbm->offset, cur_state, new_state);
105 		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
106 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
107 			(unsigned long long)bi->bi_bh->b_blocknr);
108 		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
109 			bi->bi_offset, bi->bi_bytes,
110 			(unsigned long long)gfs2_rbm_to_block(rbm));
111 		dump_stack();
112 		gfs2_consist_rgrpd(rbm->rgd);
113 		return;
114 	}
115 	*byte1 ^= (cur_state ^ new_state) << bit;
116 
117 	if (do_clone && bi->bi_clone) {
118 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
119 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
120 		*byte2 ^= (cur_state ^ new_state) << bit;
121 	}
122 }
123 
124 /**
125  * gfs2_testbit - test a bit in the bitmaps
126  * @rbm: The bit to test
127  * @use_clone: If true, test the clone bitmap, not the official bitmap.
128  *
129  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
130  * not the "real" bitmaps, to avoid allocating recently freed blocks.
131  *
132  * Returns: The two bit block state of the requested bit
133  */
134 
gfs2_testbit(const struct gfs2_rbm * rbm,bool use_clone)135 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
136 {
137 	struct gfs2_bitmap *bi = rbm_bi(rbm);
138 	const u8 *buffer;
139 	const u8 *byte;
140 	unsigned int bit;
141 
142 	if (use_clone && bi->bi_clone)
143 		buffer = bi->bi_clone;
144 	else
145 		buffer = bi->bi_bh->b_data;
146 	buffer += bi->bi_offset;
147 	byte = buffer + (rbm->offset / GFS2_NBBY);
148 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
149 
150 	return (*byte >> bit) & GFS2_BIT_MASK;
151 }
152 
153 /**
154  * gfs2_bit_search
155  * @ptr: Pointer to bitmap data
156  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
157  * @state: The state we are searching for
158  *
159  * We xor the bitmap data with a patter which is the bitwise opposite
160  * of what we are looking for, this gives rise to a pattern of ones
161  * wherever there is a match. Since we have two bits per entry, we
162  * take this pattern, shift it down by one place and then and it with
163  * the original. All the even bit positions (0,2,4, etc) then represent
164  * successful matches, so we mask with 0x55555..... to remove the unwanted
165  * odd bit positions.
166  *
167  * This allows searching of a whole u64 at once (32 blocks) with a
168  * single test (on 64 bit arches).
169  */
170 
gfs2_bit_search(const __le64 * ptr,u64 mask,u8 state)171 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
172 {
173 	u64 tmp;
174 	static const u64 search[] = {
175 		[0] = 0xffffffffffffffffULL,
176 		[1] = 0xaaaaaaaaaaaaaaaaULL,
177 		[2] = 0x5555555555555555ULL,
178 		[3] = 0x0000000000000000ULL,
179 	};
180 	tmp = le64_to_cpu(*ptr) ^ search[state];
181 	tmp &= (tmp >> 1);
182 	tmp &= mask;
183 	return tmp;
184 }
185 
186 /**
187  * rs_cmp - multi-block reservation range compare
188  * @blk: absolute file system block number of the new reservation
189  * @len: number of blocks in the new reservation
190  * @rs: existing reservation to compare against
191  *
192  * returns: 1 if the block range is beyond the reach of the reservation
193  *         -1 if the block range is before the start of the reservation
194  *          0 if the block range overlaps with the reservation
195  */
rs_cmp(u64 blk,u32 len,struct gfs2_blkreserv * rs)196 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
197 {
198 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
199 
200 	if (blk >= startblk + rs->rs_free)
201 		return 1;
202 	if (blk + len - 1 < startblk)
203 		return -1;
204 	return 0;
205 }
206 
207 /**
208  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
209  *       a block in a given allocation state.
210  * @buf: the buffer that holds the bitmaps
211  * @len: the length (in bytes) of the buffer
212  * @goal: start search at this block's bit-pair (within @buffer)
213  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
214  *
215  * Scope of @goal and returned block number is only within this bitmap buffer,
216  * not entire rgrp or filesystem.  @buffer will be offset from the actual
217  * beginning of a bitmap block buffer, skipping any header structures, but
218  * headers are always a multiple of 64 bits long so that the buffer is
219  * always aligned to a 64 bit boundary.
220  *
221  * The size of the buffer is in bytes, but is it assumed that it is
222  * always ok to read a complete multiple of 64 bits at the end
223  * of the block in case the end is no aligned to a natural boundary.
224  *
225  * Return: the block number (bitmap buffer scope) that was found
226  */
227 
gfs2_bitfit(const u8 * buf,const unsigned int len,u32 goal,u8 state)228 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
229 		       u32 goal, u8 state)
230 {
231 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
232 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
233 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
234 	u64 tmp;
235 	u64 mask = 0x5555555555555555ULL;
236 	u32 bit;
237 
238 	/* Mask off bits we don't care about at the start of the search */
239 	mask <<= spoint;
240 	tmp = gfs2_bit_search(ptr, mask, state);
241 	ptr++;
242 	while(tmp == 0 && ptr < end) {
243 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
244 		ptr++;
245 	}
246 	/* Mask off any bits which are more than len bytes from the start */
247 	if (ptr == end && (len & (sizeof(u64) - 1)))
248 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
249 	/* Didn't find anything, so return */
250 	if (tmp == 0)
251 		return BFITNOENT;
252 	ptr--;
253 	bit = __ffs64(tmp);
254 	bit /= 2;	/* two bits per entry in the bitmap */
255 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
256 }
257 
258 /**
259  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
260  * @rbm: The rbm with rgd already set correctly
261  * @block: The block number (filesystem relative)
262  *
263  * This sets the bi and offset members of an rbm based on a
264  * resource group and a filesystem relative block number. The
265  * resource group must be set in the rbm on entry, the bi and
266  * offset members will be set by this function.
267  *
268  * Returns: 0 on success, or an error code
269  */
270 
gfs2_rbm_from_block(struct gfs2_rbm * rbm,u64 block)271 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
272 {
273 	if (!rgrp_contains_block(rbm->rgd, block))
274 		return -E2BIG;
275 	rbm->bii = 0;
276 	rbm->offset = block - rbm->rgd->rd_data0;
277 	/* Check if the block is within the first block */
278 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
279 		return 0;
280 
281 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
282 	rbm->offset += (sizeof(struct gfs2_rgrp) -
283 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
284 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
285 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
286 	return 0;
287 }
288 
289 /**
290  * gfs2_rbm_incr - increment an rbm structure
291  * @rbm: The rbm with rgd already set correctly
292  *
293  * This function takes an existing rbm structure and increments it to the next
294  * viable block offset.
295  *
296  * Returns: If incrementing the offset would cause the rbm to go past the
297  *          end of the rgrp, true is returned, otherwise false.
298  *
299  */
300 
gfs2_rbm_incr(struct gfs2_rbm * rbm)301 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
302 {
303 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
304 		rbm->offset++;
305 		return false;
306 	}
307 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
308 		return true;
309 
310 	rbm->offset = 0;
311 	rbm->bii++;
312 	return false;
313 }
314 
315 /**
316  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
317  * @rbm: Position to search (value/result)
318  * @n_unaligned: Number of unaligned blocks to check
319  * @len: Decremented for each block found (terminate on zero)
320  *
321  * Returns: true if a non-free block is encountered
322  */
323 
gfs2_unaligned_extlen(struct gfs2_rbm * rbm,u32 n_unaligned,u32 * len)324 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
325 {
326 	u32 n;
327 	u8 res;
328 
329 	for (n = 0; n < n_unaligned; n++) {
330 		res = gfs2_testbit(rbm, true);
331 		if (res != GFS2_BLKST_FREE)
332 			return true;
333 		(*len)--;
334 		if (*len == 0)
335 			return true;
336 		if (gfs2_rbm_incr(rbm))
337 			return true;
338 	}
339 
340 	return false;
341 }
342 
343 /**
344  * gfs2_free_extlen - Return extent length of free blocks
345  * @rrbm: Starting position
346  * @len: Max length to check
347  *
348  * Starting at the block specified by the rbm, see how many free blocks
349  * there are, not reading more than len blocks ahead. This can be done
350  * using memchr_inv when the blocks are byte aligned, but has to be done
351  * on a block by block basis in case of unaligned blocks. Also this
352  * function can cope with bitmap boundaries (although it must stop on
353  * a resource group boundary)
354  *
355  * Returns: Number of free blocks in the extent
356  */
357 
gfs2_free_extlen(const struct gfs2_rbm * rrbm,u32 len)358 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
359 {
360 	struct gfs2_rbm rbm = *rrbm;
361 	u32 n_unaligned = rbm.offset & 3;
362 	u32 size = len;
363 	u32 bytes;
364 	u32 chunk_size;
365 	u8 *ptr, *start, *end;
366 	u64 block;
367 	struct gfs2_bitmap *bi;
368 
369 	if (n_unaligned &&
370 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
371 		goto out;
372 
373 	n_unaligned = len & 3;
374 	/* Start is now byte aligned */
375 	while (len > 3) {
376 		bi = rbm_bi(&rbm);
377 		start = bi->bi_bh->b_data;
378 		if (bi->bi_clone)
379 			start = bi->bi_clone;
380 		start += bi->bi_offset;
381 		end = start + bi->bi_bytes;
382 		BUG_ON(rbm.offset & 3);
383 		start += (rbm.offset / GFS2_NBBY);
384 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
385 		ptr = memchr_inv(start, 0, bytes);
386 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
387 		chunk_size *= GFS2_NBBY;
388 		BUG_ON(len < chunk_size);
389 		len -= chunk_size;
390 		block = gfs2_rbm_to_block(&rbm);
391 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
392 			n_unaligned = 0;
393 			break;
394 		}
395 		if (ptr) {
396 			n_unaligned = 3;
397 			break;
398 		}
399 		n_unaligned = len & 3;
400 	}
401 
402 	/* Deal with any bits left over at the end */
403 	if (n_unaligned)
404 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
405 out:
406 	return size - len;
407 }
408 
409 /**
410  * gfs2_bitcount - count the number of bits in a certain state
411  * @rgd: the resource group descriptor
412  * @buffer: the buffer that holds the bitmaps
413  * @buflen: the length (in bytes) of the buffer
414  * @state: the state of the block we're looking for
415  *
416  * Returns: The number of bits
417  */
418 
gfs2_bitcount(struct gfs2_rgrpd * rgd,const u8 * buffer,unsigned int buflen,u8 state)419 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
420 			 unsigned int buflen, u8 state)
421 {
422 	const u8 *byte = buffer;
423 	const u8 *end = buffer + buflen;
424 	const u8 state1 = state << 2;
425 	const u8 state2 = state << 4;
426 	const u8 state3 = state << 6;
427 	u32 count = 0;
428 
429 	for (; byte < end; byte++) {
430 		if (((*byte) & 0x03) == state)
431 			count++;
432 		if (((*byte) & 0x0C) == state1)
433 			count++;
434 		if (((*byte) & 0x30) == state2)
435 			count++;
436 		if (((*byte) & 0xC0) == state3)
437 			count++;
438 	}
439 
440 	return count;
441 }
442 
443 /**
444  * gfs2_rgrp_verify - Verify that a resource group is consistent
445  * @rgd: the rgrp
446  *
447  */
448 
gfs2_rgrp_verify(struct gfs2_rgrpd * rgd)449 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
450 {
451 	struct gfs2_sbd *sdp = rgd->rd_sbd;
452 	struct gfs2_bitmap *bi = NULL;
453 	u32 length = rgd->rd_length;
454 	u32 count[4], tmp;
455 	int buf, x;
456 
457 	memset(count, 0, 4 * sizeof(u32));
458 
459 	/* Count # blocks in each of 4 possible allocation states */
460 	for (buf = 0; buf < length; buf++) {
461 		bi = rgd->rd_bits + buf;
462 		for (x = 0; x < 4; x++)
463 			count[x] += gfs2_bitcount(rgd,
464 						  bi->bi_bh->b_data +
465 						  bi->bi_offset,
466 						  bi->bi_bytes, x);
467 	}
468 
469 	if (count[0] != rgd->rd_free) {
470 		if (gfs2_consist_rgrpd(rgd))
471 			fs_err(sdp, "free data mismatch:  %u != %u\n",
472 			       count[0], rgd->rd_free);
473 		return;
474 	}
475 
476 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
477 	if (count[1] != tmp) {
478 		if (gfs2_consist_rgrpd(rgd))
479 			fs_err(sdp, "used data mismatch:  %u != %u\n",
480 			       count[1], tmp);
481 		return;
482 	}
483 
484 	if (count[2] + count[3] != rgd->rd_dinodes) {
485 		if (gfs2_consist_rgrpd(rgd))
486 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
487 			       count[2] + count[3], rgd->rd_dinodes);
488 		return;
489 	}
490 }
491 
492 /**
493  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
494  * @sdp: The GFS2 superblock
495  * @blk: The data block number
496  * @exact: True if this needs to be an exact match
497  *
498  * The @exact argument should be set to true by most callers. The exception
499  * is when we need to match blocks which are not represented by the rgrp
500  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
501  * there for alignment purposes. Another way of looking at it is that @exact
502  * matches only valid data/metadata blocks, but with @exact false, it will
503  * match any block within the extent of the rgrp.
504  *
505  * Returns: The resource group, or NULL if not found
506  */
507 
gfs2_blk2rgrpd(struct gfs2_sbd * sdp,u64 blk,bool exact)508 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
509 {
510 	struct rb_node *n, *next;
511 	struct gfs2_rgrpd *cur;
512 
513 	spin_lock(&sdp->sd_rindex_spin);
514 	n = sdp->sd_rindex_tree.rb_node;
515 	while (n) {
516 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
517 		next = NULL;
518 		if (blk < cur->rd_addr)
519 			next = n->rb_left;
520 		else if (blk >= cur->rd_data0 + cur->rd_data)
521 			next = n->rb_right;
522 		if (next == NULL) {
523 			spin_unlock(&sdp->sd_rindex_spin);
524 			if (exact) {
525 				if (blk < cur->rd_addr)
526 					return NULL;
527 				if (blk >= cur->rd_data0 + cur->rd_data)
528 					return NULL;
529 			}
530 			return cur;
531 		}
532 		n = next;
533 	}
534 	spin_unlock(&sdp->sd_rindex_spin);
535 
536 	return NULL;
537 }
538 
539 /**
540  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
541  * @sdp: The GFS2 superblock
542  *
543  * Returns: The first rgrp in the filesystem
544  */
545 
gfs2_rgrpd_get_first(struct gfs2_sbd * sdp)546 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
547 {
548 	const struct rb_node *n;
549 	struct gfs2_rgrpd *rgd;
550 
551 	spin_lock(&sdp->sd_rindex_spin);
552 	n = rb_first(&sdp->sd_rindex_tree);
553 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
554 	spin_unlock(&sdp->sd_rindex_spin);
555 
556 	return rgd;
557 }
558 
559 /**
560  * gfs2_rgrpd_get_next - get the next RG
561  * @rgd: the resource group descriptor
562  *
563  * Returns: The next rgrp
564  */
565 
gfs2_rgrpd_get_next(struct gfs2_rgrpd * rgd)566 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
567 {
568 	struct gfs2_sbd *sdp = rgd->rd_sbd;
569 	const struct rb_node *n;
570 
571 	spin_lock(&sdp->sd_rindex_spin);
572 	n = rb_next(&rgd->rd_node);
573 	if (n == NULL)
574 		n = rb_first(&sdp->sd_rindex_tree);
575 
576 	if (unlikely(&rgd->rd_node == n)) {
577 		spin_unlock(&sdp->sd_rindex_spin);
578 		return NULL;
579 	}
580 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
581 	spin_unlock(&sdp->sd_rindex_spin);
582 	return rgd;
583 }
584 
check_and_update_goal(struct gfs2_inode * ip)585 void check_and_update_goal(struct gfs2_inode *ip)
586 {
587 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
588 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
589 		ip->i_goal = ip->i_no_addr;
590 }
591 
gfs2_free_clones(struct gfs2_rgrpd * rgd)592 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
593 {
594 	int x;
595 
596 	for (x = 0; x < rgd->rd_length; x++) {
597 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
598 		kfree(bi->bi_clone);
599 		bi->bi_clone = NULL;
600 	}
601 }
602 
603 /**
604  * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
605  *                 plus a quota allocations data structure, if necessary
606  * @ip: the inode for this reservation
607  */
gfs2_rsqa_alloc(struct gfs2_inode * ip)608 int gfs2_rsqa_alloc(struct gfs2_inode *ip)
609 {
610 	return gfs2_qa_alloc(ip);
611 }
612 
dump_rs(struct seq_file * seq,const struct gfs2_blkreserv * rs,const char * fs_id_buf)613 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
614 		    const char *fs_id_buf)
615 {
616 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
617 
618 	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
619 		       (unsigned long long)ip->i_no_addr,
620 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
621 		       rs->rs_rbm.offset, rs->rs_free);
622 }
623 
624 /**
625  * __rs_deltree - remove a multi-block reservation from the rgd tree
626  * @rs: The reservation to remove
627  *
628  */
__rs_deltree(struct gfs2_blkreserv * rs)629 static void __rs_deltree(struct gfs2_blkreserv *rs)
630 {
631 	struct gfs2_rgrpd *rgd;
632 
633 	if (!gfs2_rs_active(rs))
634 		return;
635 
636 	rgd = rs->rs_rbm.rgd;
637 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
638 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
639 	RB_CLEAR_NODE(&rs->rs_node);
640 
641 	if (rs->rs_free) {
642 		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
643 				 rs->rs_free - 1;
644 		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
645 		struct gfs2_bitmap *start, *last;
646 
647 		/* return reserved blocks to the rgrp */
648 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
649 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
650 		/* The rgrp extent failure point is likely not to increase;
651 		   it will only do so if the freed blocks are somehow
652 		   contiguous with a span of free blocks that follows. Still,
653 		   it will force the number to be recalculated later. */
654 		rgd->rd_extfail_pt += rs->rs_free;
655 		rs->rs_free = 0;
656 		if (gfs2_rbm_from_block(&last_rbm, last_block))
657 			return;
658 		start = rbm_bi(&rs->rs_rbm);
659 		last = rbm_bi(&last_rbm);
660 		do
661 			clear_bit(GBF_FULL, &start->bi_flags);
662 		while (start++ != last);
663 	}
664 }
665 
666 /**
667  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
668  * @rs: The reservation to remove
669  *
670  */
gfs2_rs_deltree(struct gfs2_blkreserv * rs)671 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
672 {
673 	struct gfs2_rgrpd *rgd;
674 
675 	rgd = rs->rs_rbm.rgd;
676 	if (rgd) {
677 		spin_lock(&rgd->rd_rsspin);
678 		__rs_deltree(rs);
679 		BUG_ON(rs->rs_free);
680 		spin_unlock(&rgd->rd_rsspin);
681 	}
682 }
683 
684 /**
685  * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
686  * @ip: The inode for this reservation
687  * @wcount: The inode's write count, or NULL
688  *
689  */
gfs2_rsqa_delete(struct gfs2_inode * ip,atomic_t * wcount)690 void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
691 {
692 	down_write(&ip->i_rw_mutex);
693 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
694 		gfs2_rs_deltree(&ip->i_res);
695 	up_write(&ip->i_rw_mutex);
696 	gfs2_qa_delete(ip, wcount);
697 }
698 
699 /**
700  * return_all_reservations - return all reserved blocks back to the rgrp.
701  * @rgd: the rgrp that needs its space back
702  *
703  * We previously reserved a bunch of blocks for allocation. Now we need to
704  * give them back. This leave the reservation structures in tact, but removes
705  * all of their corresponding "no-fly zones".
706  */
return_all_reservations(struct gfs2_rgrpd * rgd)707 static void return_all_reservations(struct gfs2_rgrpd *rgd)
708 {
709 	struct rb_node *n;
710 	struct gfs2_blkreserv *rs;
711 
712 	spin_lock(&rgd->rd_rsspin);
713 	while ((n = rb_first(&rgd->rd_rstree))) {
714 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
715 		__rs_deltree(rs);
716 	}
717 	spin_unlock(&rgd->rd_rsspin);
718 }
719 
gfs2_clear_rgrpd(struct gfs2_sbd * sdp)720 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
721 {
722 	struct rb_node *n;
723 	struct gfs2_rgrpd *rgd;
724 	struct gfs2_glock *gl;
725 
726 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
727 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
728 		gl = rgd->rd_gl;
729 
730 		rb_erase(n, &sdp->sd_rindex_tree);
731 
732 		if (gl) {
733 			glock_clear_object(gl, rgd);
734 			gfs2_rgrp_brelse(rgd);
735 			gfs2_glock_put(gl);
736 		}
737 
738 		gfs2_free_clones(rgd);
739 		kfree(rgd->rd_bits);
740 		rgd->rd_bits = NULL;
741 		return_all_reservations(rgd);
742 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
743 	}
744 }
745 
gfs2_rindex_print(const struct gfs2_rgrpd * rgd)746 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
747 {
748 	struct gfs2_sbd *sdp = rgd->rd_sbd;
749 
750 	fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
751 	fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
752 	fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
753 	fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
754 	fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
755 }
756 
757 /**
758  * gfs2_compute_bitstructs - Compute the bitmap sizes
759  * @rgd: The resource group descriptor
760  *
761  * Calculates bitmap descriptors, one for each block that contains bitmap data
762  *
763  * Returns: errno
764  */
765 
compute_bitstructs(struct gfs2_rgrpd * rgd)766 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
767 {
768 	struct gfs2_sbd *sdp = rgd->rd_sbd;
769 	struct gfs2_bitmap *bi;
770 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
771 	u32 bytes_left, bytes;
772 	int x;
773 
774 	if (!length)
775 		return -EINVAL;
776 
777 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
778 	if (!rgd->rd_bits)
779 		return -ENOMEM;
780 
781 	bytes_left = rgd->rd_bitbytes;
782 
783 	for (x = 0; x < length; x++) {
784 		bi = rgd->rd_bits + x;
785 
786 		bi->bi_flags = 0;
787 		/* small rgrp; bitmap stored completely in header block */
788 		if (length == 1) {
789 			bytes = bytes_left;
790 			bi->bi_offset = sizeof(struct gfs2_rgrp);
791 			bi->bi_start = 0;
792 			bi->bi_bytes = bytes;
793 			bi->bi_blocks = bytes * GFS2_NBBY;
794 		/* header block */
795 		} else if (x == 0) {
796 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
797 			bi->bi_offset = sizeof(struct gfs2_rgrp);
798 			bi->bi_start = 0;
799 			bi->bi_bytes = bytes;
800 			bi->bi_blocks = bytes * GFS2_NBBY;
801 		/* last block */
802 		} else if (x + 1 == length) {
803 			bytes = bytes_left;
804 			bi->bi_offset = sizeof(struct gfs2_meta_header);
805 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
806 			bi->bi_bytes = bytes;
807 			bi->bi_blocks = bytes * GFS2_NBBY;
808 		/* other blocks */
809 		} else {
810 			bytes = sdp->sd_sb.sb_bsize -
811 				sizeof(struct gfs2_meta_header);
812 			bi->bi_offset = sizeof(struct gfs2_meta_header);
813 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
814 			bi->bi_bytes = bytes;
815 			bi->bi_blocks = bytes * GFS2_NBBY;
816 		}
817 
818 		bytes_left -= bytes;
819 	}
820 
821 	if (bytes_left) {
822 		gfs2_consist_rgrpd(rgd);
823 		return -EIO;
824 	}
825 	bi = rgd->rd_bits + (length - 1);
826 	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
827 		if (gfs2_consist_rgrpd(rgd)) {
828 			gfs2_rindex_print(rgd);
829 			fs_err(sdp, "start=%u len=%u offset=%u\n",
830 			       bi->bi_start, bi->bi_bytes, bi->bi_offset);
831 		}
832 		return -EIO;
833 	}
834 
835 	return 0;
836 }
837 
838 /**
839  * gfs2_ri_total - Total up the file system space, according to the rindex.
840  * @sdp: the filesystem
841  *
842  */
gfs2_ri_total(struct gfs2_sbd * sdp)843 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
844 {
845 	u64 total_data = 0;
846 	struct inode *inode = sdp->sd_rindex;
847 	struct gfs2_inode *ip = GFS2_I(inode);
848 	char buf[sizeof(struct gfs2_rindex)];
849 	int error, rgrps;
850 
851 	for (rgrps = 0;; rgrps++) {
852 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
853 
854 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
855 			break;
856 		error = gfs2_internal_read(ip, buf, &pos,
857 					   sizeof(struct gfs2_rindex));
858 		if (error != sizeof(struct gfs2_rindex))
859 			break;
860 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
861 	}
862 	return total_data;
863 }
864 
rgd_insert(struct gfs2_rgrpd * rgd)865 static int rgd_insert(struct gfs2_rgrpd *rgd)
866 {
867 	struct gfs2_sbd *sdp = rgd->rd_sbd;
868 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
869 
870 	/* Figure out where to put new node */
871 	while (*newn) {
872 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
873 						  rd_node);
874 
875 		parent = *newn;
876 		if (rgd->rd_addr < cur->rd_addr)
877 			newn = &((*newn)->rb_left);
878 		else if (rgd->rd_addr > cur->rd_addr)
879 			newn = &((*newn)->rb_right);
880 		else
881 			return -EEXIST;
882 	}
883 
884 	rb_link_node(&rgd->rd_node, parent, newn);
885 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
886 	sdp->sd_rgrps++;
887 	return 0;
888 }
889 
890 /**
891  * read_rindex_entry - Pull in a new resource index entry from the disk
892  * @ip: Pointer to the rindex inode
893  *
894  * Returns: 0 on success, > 0 on EOF, error code otherwise
895  */
896 
read_rindex_entry(struct gfs2_inode * ip)897 static int read_rindex_entry(struct gfs2_inode *ip)
898 {
899 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 	const unsigned bsize = sdp->sd_sb.sb_bsize;
901 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
902 	struct gfs2_rindex buf;
903 	int error;
904 	struct gfs2_rgrpd *rgd;
905 
906 	if (pos >= i_size_read(&ip->i_inode))
907 		return 1;
908 
909 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
910 				   sizeof(struct gfs2_rindex));
911 
912 	if (error != sizeof(struct gfs2_rindex))
913 		return (error == 0) ? 1 : error;
914 
915 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
916 	error = -ENOMEM;
917 	if (!rgd)
918 		return error;
919 
920 	rgd->rd_sbd = sdp;
921 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
922 	rgd->rd_length = be32_to_cpu(buf.ri_length);
923 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
924 	rgd->rd_data = be32_to_cpu(buf.ri_data);
925 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
926 	spin_lock_init(&rgd->rd_rsspin);
927 
928 	error = compute_bitstructs(rgd);
929 	if (error)
930 		goto fail;
931 
932 	error = gfs2_glock_get(sdp, rgd->rd_addr,
933 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
934 	if (error)
935 		goto fail;
936 
937 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
938 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
939 	if (rgd->rd_data > sdp->sd_max_rg_data)
940 		sdp->sd_max_rg_data = rgd->rd_data;
941 	spin_lock(&sdp->sd_rindex_spin);
942 	error = rgd_insert(rgd);
943 	spin_unlock(&sdp->sd_rindex_spin);
944 	if (!error) {
945 		glock_set_object(rgd->rd_gl, rgd);
946 		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
947 		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
948 						    rgd->rd_length) * bsize) - 1;
949 		return 0;
950 	}
951 
952 	error = 0; /* someone else read in the rgrp; free it and ignore it */
953 	gfs2_glock_put(rgd->rd_gl);
954 
955 fail:
956 	kfree(rgd->rd_bits);
957 	rgd->rd_bits = NULL;
958 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
959 	return error;
960 }
961 
962 /**
963  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
964  * @sdp: the GFS2 superblock
965  *
966  * The purpose of this function is to select a subset of the resource groups
967  * and mark them as PREFERRED. We do it in such a way that each node prefers
968  * to use a unique set of rgrps to minimize glock contention.
969  */
set_rgrp_preferences(struct gfs2_sbd * sdp)970 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
971 {
972 	struct gfs2_rgrpd *rgd, *first;
973 	int i;
974 
975 	/* Skip an initial number of rgrps, based on this node's journal ID.
976 	   That should start each node out on its own set. */
977 	rgd = gfs2_rgrpd_get_first(sdp);
978 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
979 		rgd = gfs2_rgrpd_get_next(rgd);
980 	first = rgd;
981 
982 	do {
983 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
984 		for (i = 0; i < sdp->sd_journals; i++) {
985 			rgd = gfs2_rgrpd_get_next(rgd);
986 			if (!rgd || rgd == first)
987 				break;
988 		}
989 	} while (rgd && rgd != first);
990 }
991 
992 /**
993  * gfs2_ri_update - Pull in a new resource index from the disk
994  * @ip: pointer to the rindex inode
995  *
996  * Returns: 0 on successful update, error code otherwise
997  */
998 
gfs2_ri_update(struct gfs2_inode * ip)999 static int gfs2_ri_update(struct gfs2_inode *ip)
1000 {
1001 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1002 	int error;
1003 
1004 	do {
1005 		error = read_rindex_entry(ip);
1006 	} while (error == 0);
1007 
1008 	if (error < 0)
1009 		return error;
1010 
1011 	set_rgrp_preferences(sdp);
1012 
1013 	sdp->sd_rindex_uptodate = 1;
1014 	return 0;
1015 }
1016 
1017 /**
1018  * gfs2_rindex_update - Update the rindex if required
1019  * @sdp: The GFS2 superblock
1020  *
1021  * We grab a lock on the rindex inode to make sure that it doesn't
1022  * change whilst we are performing an operation. We keep this lock
1023  * for quite long periods of time compared to other locks. This
1024  * doesn't matter, since it is shared and it is very, very rarely
1025  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1026  *
1027  * This makes sure that we're using the latest copy of the resource index
1028  * special file, which might have been updated if someone expanded the
1029  * filesystem (via gfs2_grow utility), which adds new resource groups.
1030  *
1031  * Returns: 0 on succeess, error code otherwise
1032  */
1033 
gfs2_rindex_update(struct gfs2_sbd * sdp)1034 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1035 {
1036 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1037 	struct gfs2_glock *gl = ip->i_gl;
1038 	struct gfs2_holder ri_gh;
1039 	int error = 0;
1040 	int unlock_required = 0;
1041 
1042 	/* Read new copy from disk if we don't have the latest */
1043 	if (!sdp->sd_rindex_uptodate) {
1044 		if (!gfs2_glock_is_locked_by_me(gl)) {
1045 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1046 			if (error)
1047 				return error;
1048 			unlock_required = 1;
1049 		}
1050 		if (!sdp->sd_rindex_uptodate)
1051 			error = gfs2_ri_update(ip);
1052 		if (unlock_required)
1053 			gfs2_glock_dq_uninit(&ri_gh);
1054 	}
1055 
1056 	return error;
1057 }
1058 
gfs2_rgrp_in(struct gfs2_rgrpd * rgd,const void * buf)1059 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1060 {
1061 	const struct gfs2_rgrp *str = buf;
1062 	u32 rg_flags;
1063 
1064 	rg_flags = be32_to_cpu(str->rg_flags);
1065 	rg_flags &= ~GFS2_RDF_MASK;
1066 	rgd->rd_flags &= GFS2_RDF_MASK;
1067 	rgd->rd_flags |= rg_flags;
1068 	rgd->rd_free = be32_to_cpu(str->rg_free);
1069 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1070 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1071 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1072 }
1073 
gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb * rgl,const void * buf)1074 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1075 {
1076 	const struct gfs2_rgrp *str = buf;
1077 
1078 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1079 	rgl->rl_flags = str->rg_flags;
1080 	rgl->rl_free = str->rg_free;
1081 	rgl->rl_dinodes = str->rg_dinodes;
1082 	rgl->rl_igeneration = str->rg_igeneration;
1083 	rgl->__pad = 0UL;
1084 }
1085 
gfs2_rgrp_out(struct gfs2_rgrpd * rgd,void * buf)1086 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1087 {
1088 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1089 	struct gfs2_rgrp *str = buf;
1090 	u32 crc;
1091 
1092 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1093 	str->rg_free = cpu_to_be32(rgd->rd_free);
1094 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1095 	if (next == NULL)
1096 		str->rg_skip = 0;
1097 	else if (next->rd_addr > rgd->rd_addr)
1098 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1099 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1100 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1101 	str->rg_data = cpu_to_be32(rgd->rd_data);
1102 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1103 	str->rg_crc = 0;
1104 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1105 	str->rg_crc = cpu_to_be32(crc);
1106 
1107 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1108 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1109 }
1110 
gfs2_rgrp_lvb_valid(struct gfs2_rgrpd * rgd)1111 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1112 {
1113 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1114 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1115 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1116 	int valid = 1;
1117 
1118 	if (rgl->rl_flags != str->rg_flags) {
1119 		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1120 			(unsigned long long)rgd->rd_addr,
1121 		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1122 		valid = 0;
1123 	}
1124 	if (rgl->rl_free != str->rg_free) {
1125 		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1126 			(unsigned long long)rgd->rd_addr,
1127 			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1128 		valid = 0;
1129 	}
1130 	if (rgl->rl_dinodes != str->rg_dinodes) {
1131 		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1132 			(unsigned long long)rgd->rd_addr,
1133 			be32_to_cpu(rgl->rl_dinodes),
1134 			be32_to_cpu(str->rg_dinodes));
1135 		valid = 0;
1136 	}
1137 	if (rgl->rl_igeneration != str->rg_igeneration) {
1138 		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1139 			(unsigned long long)rgd->rd_addr,
1140 			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1141 			(unsigned long long)be64_to_cpu(str->rg_igeneration));
1142 		valid = 0;
1143 	}
1144 	return valid;
1145 }
1146 
count_unlinked(struct gfs2_rgrpd * rgd)1147 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1148 {
1149 	struct gfs2_bitmap *bi;
1150 	const u32 length = rgd->rd_length;
1151 	const u8 *buffer = NULL;
1152 	u32 i, goal, count = 0;
1153 
1154 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1155 		goal = 0;
1156 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1157 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1158 		while (goal < bi->bi_blocks) {
1159 			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1160 					   GFS2_BLKST_UNLINKED);
1161 			if (goal == BFITNOENT)
1162 				break;
1163 			count++;
1164 			goal++;
1165 		}
1166 	}
1167 
1168 	return count;
1169 }
1170 
1171 
1172 /**
1173  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1174  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1175  *
1176  * Read in all of a Resource Group's header and bitmap blocks.
1177  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1178  *
1179  * Returns: errno
1180  */
1181 
gfs2_rgrp_bh_get(struct gfs2_rgrpd * rgd)1182 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1183 {
1184 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1185 	struct gfs2_glock *gl = rgd->rd_gl;
1186 	unsigned int length = rgd->rd_length;
1187 	struct gfs2_bitmap *bi;
1188 	unsigned int x, y;
1189 	int error;
1190 
1191 	if (rgd->rd_bits[0].bi_bh != NULL)
1192 		return 0;
1193 
1194 	for (x = 0; x < length; x++) {
1195 		bi = rgd->rd_bits + x;
1196 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1197 		if (error)
1198 			goto fail;
1199 	}
1200 
1201 	for (y = length; y--;) {
1202 		bi = rgd->rd_bits + y;
1203 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1204 		if (error)
1205 			goto fail;
1206 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1207 					      GFS2_METATYPE_RG)) {
1208 			error = -EIO;
1209 			goto fail;
1210 		}
1211 	}
1212 
1213 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1214 		for (x = 0; x < length; x++)
1215 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1216 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1217 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1218 		rgd->rd_free_clone = rgd->rd_free;
1219 		/* max out the rgrp allocation failure point */
1220 		rgd->rd_extfail_pt = rgd->rd_free;
1221 	}
1222 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1223 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1224 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1225 				     rgd->rd_bits[0].bi_bh->b_data);
1226 	}
1227 	else if (sdp->sd_args.ar_rgrplvb) {
1228 		if (!gfs2_rgrp_lvb_valid(rgd)){
1229 			gfs2_consist_rgrpd(rgd);
1230 			error = -EIO;
1231 			goto fail;
1232 		}
1233 		if (rgd->rd_rgl->rl_unlinked == 0)
1234 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1235 	}
1236 	return 0;
1237 
1238 fail:
1239 	while (x--) {
1240 		bi = rgd->rd_bits + x;
1241 		brelse(bi->bi_bh);
1242 		bi->bi_bh = NULL;
1243 		gfs2_assert_warn(sdp, !bi->bi_clone);
1244 	}
1245 
1246 	return error;
1247 }
1248 
update_rgrp_lvb(struct gfs2_rgrpd * rgd)1249 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1250 {
1251 	u32 rl_flags;
1252 
1253 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1254 		return 0;
1255 
1256 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1257 		return gfs2_rgrp_bh_get(rgd);
1258 
1259 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1260 	rl_flags &= ~GFS2_RDF_MASK;
1261 	rgd->rd_flags &= GFS2_RDF_MASK;
1262 	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1263 	if (rgd->rd_rgl->rl_unlinked == 0)
1264 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1265 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1266 	rgd->rd_free_clone = rgd->rd_free;
1267 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1268 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1269 	return 0;
1270 }
1271 
gfs2_rgrp_go_lock(struct gfs2_holder * gh)1272 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1273 {
1274 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1275 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1276 
1277 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1278 		return 0;
1279 	return gfs2_rgrp_bh_get(rgd);
1280 }
1281 
1282 /**
1283  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1284  * @rgd: The resource group
1285  *
1286  */
1287 
gfs2_rgrp_brelse(struct gfs2_rgrpd * rgd)1288 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1289 {
1290 	int x, length = rgd->rd_length;
1291 
1292 	for (x = 0; x < length; x++) {
1293 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1294 		if (bi->bi_bh) {
1295 			brelse(bi->bi_bh);
1296 			bi->bi_bh = NULL;
1297 		}
1298 	}
1299 
1300 }
1301 
1302 /**
1303  * gfs2_rgrp_go_unlock - Unlock a rgrp glock
1304  * @gh: The glock holder for the resource group
1305  *
1306  */
1307 
gfs2_rgrp_go_unlock(struct gfs2_holder * gh)1308 void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
1309 {
1310 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1311 	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
1312 		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
1313 
1314 	if (rgd && demote_requested)
1315 		gfs2_rgrp_brelse(rgd);
1316 }
1317 
gfs2_rgrp_send_discards(struct gfs2_sbd * sdp,u64 offset,struct buffer_head * bh,const struct gfs2_bitmap * bi,unsigned minlen,u64 * ptrimmed)1318 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1319 			     struct buffer_head *bh,
1320 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1321 {
1322 	struct super_block *sb = sdp->sd_vfs;
1323 	u64 blk;
1324 	sector_t start = 0;
1325 	sector_t nr_blks = 0;
1326 	int rv;
1327 	unsigned int x;
1328 	u32 trimmed = 0;
1329 	u8 diff;
1330 
1331 	for (x = 0; x < bi->bi_bytes; x++) {
1332 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1333 		clone += bi->bi_offset;
1334 		clone += x;
1335 		if (bh) {
1336 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1337 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1338 		} else {
1339 			diff = ~(*clone | (*clone >> 1));
1340 		}
1341 		diff &= 0x55;
1342 		if (diff == 0)
1343 			continue;
1344 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1345 		while(diff) {
1346 			if (diff & 1) {
1347 				if (nr_blks == 0)
1348 					goto start_new_extent;
1349 				if ((start + nr_blks) != blk) {
1350 					if (nr_blks >= minlen) {
1351 						rv = sb_issue_discard(sb,
1352 							start, nr_blks,
1353 							GFP_NOFS, 0);
1354 						if (rv)
1355 							goto fail;
1356 						trimmed += nr_blks;
1357 					}
1358 					nr_blks = 0;
1359 start_new_extent:
1360 					start = blk;
1361 				}
1362 				nr_blks++;
1363 			}
1364 			diff >>= 2;
1365 			blk++;
1366 		}
1367 	}
1368 	if (nr_blks >= minlen) {
1369 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1370 		if (rv)
1371 			goto fail;
1372 		trimmed += nr_blks;
1373 	}
1374 	if (ptrimmed)
1375 		*ptrimmed = trimmed;
1376 	return 0;
1377 
1378 fail:
1379 	if (sdp->sd_args.ar_discard)
1380 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1381 	sdp->sd_args.ar_discard = 0;
1382 	return -EIO;
1383 }
1384 
1385 /**
1386  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1387  * @filp: Any file on the filesystem
1388  * @argp: Pointer to the arguments (also used to pass result)
1389  *
1390  * Returns: 0 on success, otherwise error code
1391  */
1392 
gfs2_fitrim(struct file * filp,void __user * argp)1393 int gfs2_fitrim(struct file *filp, void __user *argp)
1394 {
1395 	struct inode *inode = file_inode(filp);
1396 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1397 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1398 	struct buffer_head *bh;
1399 	struct gfs2_rgrpd *rgd;
1400 	struct gfs2_rgrpd *rgd_end;
1401 	struct gfs2_holder gh;
1402 	struct fstrim_range r;
1403 	int ret = 0;
1404 	u64 amt;
1405 	u64 trimmed = 0;
1406 	u64 start, end, minlen;
1407 	unsigned int x;
1408 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1409 
1410 	if (!capable(CAP_SYS_ADMIN))
1411 		return -EPERM;
1412 
1413 	if (!blk_queue_discard(q))
1414 		return -EOPNOTSUPP;
1415 
1416 	if (copy_from_user(&r, argp, sizeof(r)))
1417 		return -EFAULT;
1418 
1419 	ret = gfs2_rindex_update(sdp);
1420 	if (ret)
1421 		return ret;
1422 
1423 	start = r.start >> bs_shift;
1424 	end = start + (r.len >> bs_shift);
1425 	minlen = max_t(u64, r.minlen,
1426 		       q->limits.discard_granularity) >> bs_shift;
1427 
1428 	if (end <= start || minlen > sdp->sd_max_rg_data)
1429 		return -EINVAL;
1430 
1431 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1432 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1433 
1434 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1435 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1436 		return -EINVAL; /* start is beyond the end of the fs */
1437 
1438 	while (1) {
1439 
1440 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1441 		if (ret)
1442 			goto out;
1443 
1444 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1445 			/* Trim each bitmap in the rgrp */
1446 			for (x = 0; x < rgd->rd_length; x++) {
1447 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1448 				ret = gfs2_rgrp_send_discards(sdp,
1449 						rgd->rd_data0, NULL, bi, minlen,
1450 						&amt);
1451 				if (ret) {
1452 					gfs2_glock_dq_uninit(&gh);
1453 					goto out;
1454 				}
1455 				trimmed += amt;
1456 			}
1457 
1458 			/* Mark rgrp as having been trimmed */
1459 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1460 			if (ret == 0) {
1461 				bh = rgd->rd_bits[0].bi_bh;
1462 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1463 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1464 				gfs2_rgrp_out(rgd, bh->b_data);
1465 				gfs2_trans_end(sdp);
1466 			}
1467 		}
1468 		gfs2_glock_dq_uninit(&gh);
1469 
1470 		if (rgd == rgd_end)
1471 			break;
1472 
1473 		rgd = gfs2_rgrpd_get_next(rgd);
1474 	}
1475 
1476 out:
1477 	r.len = trimmed << bs_shift;
1478 	if (copy_to_user(argp, &r, sizeof(r)))
1479 		return -EFAULT;
1480 
1481 	return ret;
1482 }
1483 
1484 /**
1485  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1486  * @ip: the inode structure
1487  *
1488  */
rs_insert(struct gfs2_inode * ip)1489 static void rs_insert(struct gfs2_inode *ip)
1490 {
1491 	struct rb_node **newn, *parent = NULL;
1492 	int rc;
1493 	struct gfs2_blkreserv *rs = &ip->i_res;
1494 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1495 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1496 
1497 	BUG_ON(gfs2_rs_active(rs));
1498 
1499 	spin_lock(&rgd->rd_rsspin);
1500 	newn = &rgd->rd_rstree.rb_node;
1501 	while (*newn) {
1502 		struct gfs2_blkreserv *cur =
1503 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1504 
1505 		parent = *newn;
1506 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1507 		if (rc > 0)
1508 			newn = &((*newn)->rb_right);
1509 		else if (rc < 0)
1510 			newn = &((*newn)->rb_left);
1511 		else {
1512 			spin_unlock(&rgd->rd_rsspin);
1513 			WARN_ON(1);
1514 			return;
1515 		}
1516 	}
1517 
1518 	rb_link_node(&rs->rs_node, parent, newn);
1519 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1520 
1521 	/* Do our rgrp accounting for the reservation */
1522 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1523 	spin_unlock(&rgd->rd_rsspin);
1524 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1525 }
1526 
1527 /**
1528  * rgd_free - return the number of free blocks we can allocate.
1529  * @rgd: the resource group
1530  *
1531  * This function returns the number of free blocks for an rgrp.
1532  * That's the clone-free blocks (blocks that are free, not including those
1533  * still being used for unlinked files that haven't been deleted.)
1534  *
1535  * It also subtracts any blocks reserved by someone else, but does not
1536  * include free blocks that are still part of our current reservation,
1537  * because obviously we can (and will) allocate them.
1538  */
rgd_free(struct gfs2_rgrpd * rgd,struct gfs2_blkreserv * rs)1539 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1540 {
1541 	u32 tot_reserved, tot_free;
1542 
1543 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1544 		return 0;
1545 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1546 
1547 	if (rgd->rd_free_clone < tot_reserved)
1548 		tot_reserved = 0;
1549 
1550 	tot_free = rgd->rd_free_clone - tot_reserved;
1551 
1552 	return tot_free;
1553 }
1554 
1555 /**
1556  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1557  * @rgd: the resource group descriptor
1558  * @ip: pointer to the inode for which we're reserving blocks
1559  * @ap: the allocation parameters
1560  *
1561  */
1562 
rg_mblk_search(struct gfs2_rgrpd * rgd,struct gfs2_inode * ip,const struct gfs2_alloc_parms * ap)1563 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1564 			   const struct gfs2_alloc_parms *ap)
1565 {
1566 	struct gfs2_rbm rbm = { .rgd = rgd, };
1567 	u64 goal;
1568 	struct gfs2_blkreserv *rs = &ip->i_res;
1569 	u32 extlen;
1570 	u32 free_blocks = rgd_free(rgd, rs);
1571 	int ret;
1572 	struct inode *inode = &ip->i_inode;
1573 
1574 	if (S_ISDIR(inode->i_mode))
1575 		extlen = 1;
1576 	else {
1577 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1578 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1579 	}
1580 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1581 		return;
1582 
1583 	/* Find bitmap block that contains bits for goal block */
1584 	if (rgrp_contains_block(rgd, ip->i_goal))
1585 		goal = ip->i_goal;
1586 	else
1587 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1588 
1589 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1590 		return;
1591 
1592 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1593 	if (ret == 0) {
1594 		rs->rs_rbm = rbm;
1595 		rs->rs_free = extlen;
1596 		rs_insert(ip);
1597 	} else {
1598 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1599 			rgd->rd_last_alloc = 0;
1600 	}
1601 }
1602 
1603 /**
1604  * gfs2_next_unreserved_block - Return next block that is not reserved
1605  * @rgd: The resource group
1606  * @block: The starting block
1607  * @length: The required length
1608  * @ip: Ignore any reservations for this inode
1609  *
1610  * If the block does not appear in any reservation, then return the
1611  * block number unchanged. If it does appear in the reservation, then
1612  * keep looking through the tree of reservations in order to find the
1613  * first block number which is not reserved.
1614  */
1615 
gfs2_next_unreserved_block(struct gfs2_rgrpd * rgd,u64 block,u32 length,const struct gfs2_inode * ip)1616 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1617 				      u32 length,
1618 				      const struct gfs2_inode *ip)
1619 {
1620 	struct gfs2_blkreserv *rs;
1621 	struct rb_node *n;
1622 	int rc;
1623 
1624 	spin_lock(&rgd->rd_rsspin);
1625 	n = rgd->rd_rstree.rb_node;
1626 	while (n) {
1627 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1628 		rc = rs_cmp(block, length, rs);
1629 		if (rc < 0)
1630 			n = n->rb_left;
1631 		else if (rc > 0)
1632 			n = n->rb_right;
1633 		else
1634 			break;
1635 	}
1636 
1637 	if (n) {
1638 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1639 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1640 			n = n->rb_right;
1641 			if (n == NULL)
1642 				break;
1643 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1644 		}
1645 	}
1646 
1647 	spin_unlock(&rgd->rd_rsspin);
1648 	return block;
1649 }
1650 
1651 /**
1652  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1653  * @rbm: The current position in the resource group
1654  * @ip: The inode for which we are searching for blocks
1655  * @minext: The minimum extent length
1656  * @maxext: A pointer to the maximum extent structure
1657  *
1658  * This checks the current position in the rgrp to see whether there is
1659  * a reservation covering this block. If not then this function is a
1660  * no-op. If there is, then the position is moved to the end of the
1661  * contiguous reservation(s) so that we are pointing at the first
1662  * non-reserved block.
1663  *
1664  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1665  */
1666 
gfs2_reservation_check_and_update(struct gfs2_rbm * rbm,const struct gfs2_inode * ip,u32 minext,struct gfs2_extent * maxext)1667 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1668 					     const struct gfs2_inode *ip,
1669 					     u32 minext,
1670 					     struct gfs2_extent *maxext)
1671 {
1672 	u64 block = gfs2_rbm_to_block(rbm);
1673 	u32 extlen = 1;
1674 	u64 nblock;
1675 	int ret;
1676 
1677 	/*
1678 	 * If we have a minimum extent length, then skip over any extent
1679 	 * which is less than the min extent length in size.
1680 	 */
1681 	if (minext) {
1682 		extlen = gfs2_free_extlen(rbm, minext);
1683 		if (extlen <= maxext->len)
1684 			goto fail;
1685 	}
1686 
1687 	/*
1688 	 * Check the extent which has been found against the reservations
1689 	 * and skip if parts of it are already reserved
1690 	 */
1691 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1692 	if (nblock == block) {
1693 		if (!minext || extlen >= minext)
1694 			return 0;
1695 
1696 		if (extlen > maxext->len) {
1697 			maxext->len = extlen;
1698 			maxext->rbm = *rbm;
1699 		}
1700 fail:
1701 		nblock = block + extlen;
1702 	}
1703 	ret = gfs2_rbm_from_block(rbm, nblock);
1704 	if (ret < 0)
1705 		return ret;
1706 	return 1;
1707 }
1708 
1709 /**
1710  * gfs2_rbm_find - Look for blocks of a particular state
1711  * @rbm: Value/result starting position and final position
1712  * @state: The state which we want to find
1713  * @minext: Pointer to the requested extent length (NULL for a single block)
1714  *          This is updated to be the actual reservation size.
1715  * @ip: If set, check for reservations
1716  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1717  *          around until we've reached the starting point.
1718  *
1719  * Side effects:
1720  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1721  *   has no free blocks in it.
1722  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1723  *   has come up short on a free block search.
1724  *
1725  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1726  */
1727 
gfs2_rbm_find(struct gfs2_rbm * rbm,u8 state,u32 * minext,const struct gfs2_inode * ip,bool nowrap)1728 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1729 			 const struct gfs2_inode *ip, bool nowrap)
1730 {
1731 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1732 	struct buffer_head *bh;
1733 	int last_bii;
1734 	u32 offset;
1735 	u8 *buffer;
1736 	bool wrapped = false;
1737 	int ret;
1738 	struct gfs2_bitmap *bi;
1739 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1740 
1741 	/*
1742 	 * Determine the last bitmap to search.  If we're not starting at the
1743 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1744 	 * the entire resource group.
1745 	 */
1746 	last_bii = rbm->bii - (rbm->offset == 0);
1747 
1748 	while(1) {
1749 		bi = rbm_bi(rbm);
1750 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1751 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1752 		    (state == GFS2_BLKST_FREE))
1753 			goto next_bitmap;
1754 
1755 		bh = bi->bi_bh;
1756 		buffer = bh->b_data + bi->bi_offset;
1757 		WARN_ON(!buffer_uptodate(bh));
1758 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1759 			buffer = bi->bi_clone + bi->bi_offset;
1760 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1761 		if (offset == BFITNOENT) {
1762 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1763 				set_bit(GBF_FULL, &bi->bi_flags);
1764 			goto next_bitmap;
1765 		}
1766 		rbm->offset = offset;
1767 		if (ip == NULL)
1768 			return 0;
1769 
1770 		ret = gfs2_reservation_check_and_update(rbm, ip,
1771 							minext ? *minext : 0,
1772 							&maxext);
1773 		if (ret == 0)
1774 			return 0;
1775 		if (ret > 0)
1776 			goto next_iter;
1777 		if (ret == -E2BIG) {
1778 			rbm->bii = 0;
1779 			rbm->offset = 0;
1780 			goto res_covered_end_of_rgrp;
1781 		}
1782 		return ret;
1783 
1784 next_bitmap:	/* Find next bitmap in the rgrp */
1785 		rbm->offset = 0;
1786 		rbm->bii++;
1787 		if (rbm->bii == rbm->rgd->rd_length)
1788 			rbm->bii = 0;
1789 res_covered_end_of_rgrp:
1790 		if (rbm->bii == 0) {
1791 			if (wrapped)
1792 				break;
1793 			wrapped = true;
1794 			if (nowrap)
1795 				break;
1796 		}
1797 next_iter:
1798 		/* Have we scanned the entire resource group? */
1799 		if (wrapped && rbm->bii > last_bii)
1800 			break;
1801 	}
1802 
1803 	if (minext == NULL || state != GFS2_BLKST_FREE)
1804 		return -ENOSPC;
1805 
1806 	/* If the extent was too small, and it's smaller than the smallest
1807 	   to have failed before, remember for future reference that it's
1808 	   useless to search this rgrp again for this amount or more. */
1809 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1810 	    *minext < rbm->rgd->rd_extfail_pt)
1811 		rbm->rgd->rd_extfail_pt = *minext;
1812 
1813 	/* If the maximum extent we found is big enough to fulfill the
1814 	   minimum requirements, use it anyway. */
1815 	if (maxext.len) {
1816 		*rbm = maxext.rbm;
1817 		*minext = maxext.len;
1818 		return 0;
1819 	}
1820 
1821 	return -ENOSPC;
1822 }
1823 
1824 /**
1825  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1826  * @rgd: The rgrp
1827  * @last_unlinked: block address of the last dinode we unlinked
1828  * @skip: block address we should explicitly not unlink
1829  *
1830  * Returns: 0 if no error
1831  *          The inode, if one has been found, in inode.
1832  */
1833 
try_rgrp_unlink(struct gfs2_rgrpd * rgd,u64 * last_unlinked,u64 skip)1834 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1835 {
1836 	u64 block;
1837 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1838 	struct gfs2_glock *gl;
1839 	struct gfs2_inode *ip;
1840 	int error;
1841 	int found = 0;
1842 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1843 
1844 	while (1) {
1845 		down_write(&sdp->sd_log_flush_lock);
1846 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1847 				      true);
1848 		up_write(&sdp->sd_log_flush_lock);
1849 		if (error == -ENOSPC)
1850 			break;
1851 		if (WARN_ON_ONCE(error))
1852 			break;
1853 
1854 		block = gfs2_rbm_to_block(&rbm);
1855 		if (gfs2_rbm_from_block(&rbm, block + 1))
1856 			break;
1857 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1858 			continue;
1859 		if (block == skip)
1860 			continue;
1861 		*last_unlinked = block;
1862 
1863 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1864 		if (error)
1865 			continue;
1866 
1867 		/* If the inode is already in cache, we can ignore it here
1868 		 * because the existing inode disposal code will deal with
1869 		 * it when all refs have gone away. Accessing gl_object like
1870 		 * this is not safe in general. Here it is ok because we do
1871 		 * not dereference the pointer, and we only need an approx
1872 		 * answer to whether it is NULL or not.
1873 		 */
1874 		ip = gl->gl_object;
1875 
1876 		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1877 			gfs2_glock_put(gl);
1878 		else
1879 			found++;
1880 
1881 		/* Limit reclaim to sensible number of tasks */
1882 		if (found > NR_CPUS)
1883 			return;
1884 	}
1885 
1886 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1887 	return;
1888 }
1889 
1890 /**
1891  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1892  * @rgd: The rgrp in question
1893  * @loops: An indication of how picky we can be (0=very, 1=less so)
1894  *
1895  * This function uses the recently added glock statistics in order to
1896  * figure out whether a parciular resource group is suffering from
1897  * contention from multiple nodes. This is done purely on the basis
1898  * of timings, since this is the only data we have to work with and
1899  * our aim here is to reject a resource group which is highly contended
1900  * but (very important) not to do this too often in order to ensure that
1901  * we do not land up introducing fragmentation by changing resource
1902  * groups when not actually required.
1903  *
1904  * The calculation is fairly simple, we want to know whether the SRTTB
1905  * (i.e. smoothed round trip time for blocking operations) to acquire
1906  * the lock for this rgrp's glock is significantly greater than the
1907  * time taken for resource groups on average. We introduce a margin in
1908  * the form of the variable @var which is computed as the sum of the two
1909  * respective variences, and multiplied by a factor depending on @loops
1910  * and whether we have a lot of data to base the decision on. This is
1911  * then tested against the square difference of the means in order to
1912  * decide whether the result is statistically significant or not.
1913  *
1914  * Returns: A boolean verdict on the congestion status
1915  */
1916 
gfs2_rgrp_congested(const struct gfs2_rgrpd * rgd,int loops)1917 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1918 {
1919 	const struct gfs2_glock *gl = rgd->rd_gl;
1920 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1921 	struct gfs2_lkstats *st;
1922 	u64 r_dcount, l_dcount;
1923 	u64 l_srttb, a_srttb = 0;
1924 	s64 srttb_diff;
1925 	u64 sqr_diff;
1926 	u64 var;
1927 	int cpu, nonzero = 0;
1928 
1929 	preempt_disable();
1930 	for_each_present_cpu(cpu) {
1931 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1932 		if (st->stats[GFS2_LKS_SRTTB]) {
1933 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1934 			nonzero++;
1935 		}
1936 	}
1937 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1938 	if (nonzero)
1939 		do_div(a_srttb, nonzero);
1940 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1941 	var = st->stats[GFS2_LKS_SRTTVARB] +
1942 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1943 	preempt_enable();
1944 
1945 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1946 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1947 
1948 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1949 		return false;
1950 
1951 	srttb_diff = a_srttb - l_srttb;
1952 	sqr_diff = srttb_diff * srttb_diff;
1953 
1954 	var *= 2;
1955 	if (l_dcount < 8 || r_dcount < 8)
1956 		var *= 2;
1957 	if (loops == 1)
1958 		var *= 2;
1959 
1960 	return ((srttb_diff < 0) && (sqr_diff > var));
1961 }
1962 
1963 /**
1964  * gfs2_rgrp_used_recently
1965  * @rs: The block reservation with the rgrp to test
1966  * @msecs: The time limit in milliseconds
1967  *
1968  * Returns: True if the rgrp glock has been used within the time limit
1969  */
gfs2_rgrp_used_recently(const struct gfs2_blkreserv * rs,u64 msecs)1970 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1971 				    u64 msecs)
1972 {
1973 	u64 tdiff;
1974 
1975 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1976                             rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1977 
1978 	return tdiff > (msecs * 1000 * 1000);
1979 }
1980 
gfs2_orlov_skip(const struct gfs2_inode * ip)1981 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1982 {
1983 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1984 	u32 skip;
1985 
1986 	get_random_bytes(&skip, sizeof(skip));
1987 	return skip % sdp->sd_rgrps;
1988 }
1989 
gfs2_select_rgrp(struct gfs2_rgrpd ** pos,const struct gfs2_rgrpd * begin)1990 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1991 {
1992 	struct gfs2_rgrpd *rgd = *pos;
1993 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1994 
1995 	rgd = gfs2_rgrpd_get_next(rgd);
1996 	if (rgd == NULL)
1997 		rgd = gfs2_rgrpd_get_first(sdp);
1998 	*pos = rgd;
1999 	if (rgd != begin) /* If we didn't wrap */
2000 		return true;
2001 	return false;
2002 }
2003 
2004 /**
2005  * fast_to_acquire - determine if a resource group will be fast to acquire
2006  *
2007  * If this is one of our preferred rgrps, it should be quicker to acquire,
2008  * because we tried to set ourselves up as dlm lock master.
2009  */
fast_to_acquire(struct gfs2_rgrpd * rgd)2010 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
2011 {
2012 	struct gfs2_glock *gl = rgd->rd_gl;
2013 
2014 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
2015 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
2016 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
2017 		return 1;
2018 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
2019 		return 1;
2020 	return 0;
2021 }
2022 
2023 /**
2024  * gfs2_inplace_reserve - Reserve space in the filesystem
2025  * @ip: the inode to reserve space for
2026  * @ap: the allocation parameters
2027  *
2028  * We try our best to find an rgrp that has at least ap->target blocks
2029  * available. After a couple of passes (loops == 2), the prospects of finding
2030  * such an rgrp diminish. At this stage, we return the first rgrp that has
2031  * at least ap->min_target blocks available. Either way, we set ap->allowed to
2032  * the number of blocks available in the chosen rgrp.
2033  *
2034  * Returns: 0 on success,
2035  *          -ENOMEM if a suitable rgrp can't be found
2036  *          errno otherwise
2037  */
2038 
gfs2_inplace_reserve(struct gfs2_inode * ip,struct gfs2_alloc_parms * ap)2039 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2040 {
2041 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2042 	struct gfs2_rgrpd *begin = NULL;
2043 	struct gfs2_blkreserv *rs = &ip->i_res;
2044 	int error = 0, rg_locked, flags = 0;
2045 	u64 last_unlinked = NO_BLOCK;
2046 	int loops = 0;
2047 	u32 free_blocks, skip = 0;
2048 
2049 	if (sdp->sd_args.ar_rgrplvb)
2050 		flags |= GL_SKIP;
2051 	if (gfs2_assert_warn(sdp, ap->target))
2052 		return -EINVAL;
2053 	if (gfs2_rs_active(rs)) {
2054 		begin = rs->rs_rbm.rgd;
2055 	} else if (rs->rs_rbm.rgd &&
2056 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2057 		begin = rs->rs_rbm.rgd;
2058 	} else {
2059 		check_and_update_goal(ip);
2060 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2061 	}
2062 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2063 		skip = gfs2_orlov_skip(ip);
2064 	if (rs->rs_rbm.rgd == NULL)
2065 		return -EBADSLT;
2066 
2067 	while (loops < 3) {
2068 		rg_locked = 1;
2069 
2070 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2071 			rg_locked = 0;
2072 			if (skip && skip--)
2073 				goto next_rgrp;
2074 			if (!gfs2_rs_active(rs)) {
2075 				if (loops == 0 &&
2076 				    !fast_to_acquire(rs->rs_rbm.rgd))
2077 					goto next_rgrp;
2078 				if ((loops < 2) &&
2079 				    gfs2_rgrp_used_recently(rs, 1000) &&
2080 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2081 					goto next_rgrp;
2082 			}
2083 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2084 						   LM_ST_EXCLUSIVE, flags,
2085 						   &ip->i_rgd_gh);
2086 			if (unlikely(error))
2087 				return error;
2088 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2089 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2090 				goto skip_rgrp;
2091 			if (sdp->sd_args.ar_rgrplvb) {
2092 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2093 				if (unlikely(error)) {
2094 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2095 					return error;
2096 				}
2097 			}
2098 		}
2099 
2100 		/* Skip unusable resource groups */
2101 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2102 						 GFS2_RDF_ERROR)) ||
2103 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2104 			goto skip_rgrp;
2105 
2106 		if (sdp->sd_args.ar_rgrplvb)
2107 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2108 
2109 		/* Get a reservation if we don't already have one */
2110 		if (!gfs2_rs_active(rs))
2111 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2112 
2113 		/* Skip rgrps when we can't get a reservation on first pass */
2114 		if (!gfs2_rs_active(rs) && (loops < 1))
2115 			goto check_rgrp;
2116 
2117 		/* If rgrp has enough free space, use it */
2118 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2119 		if (free_blocks >= ap->target ||
2120 		    (loops == 2 && ap->min_target &&
2121 		     free_blocks >= ap->min_target)) {
2122 			ap->allowed = free_blocks;
2123 			return 0;
2124 		}
2125 check_rgrp:
2126 		/* Check for unlinked inodes which can be reclaimed */
2127 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2128 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2129 					ip->i_no_addr);
2130 skip_rgrp:
2131 		/* Drop reservation, if we couldn't use reserved rgrp */
2132 		if (gfs2_rs_active(rs))
2133 			gfs2_rs_deltree(rs);
2134 
2135 		/* Unlock rgrp if required */
2136 		if (!rg_locked)
2137 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2138 next_rgrp:
2139 		/* Find the next rgrp, and continue looking */
2140 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2141 			continue;
2142 		if (skip)
2143 			continue;
2144 
2145 		/* If we've scanned all the rgrps, but found no free blocks
2146 		 * then this checks for some less likely conditions before
2147 		 * trying again.
2148 		 */
2149 		loops++;
2150 		/* Check that fs hasn't grown if writing to rindex */
2151 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2152 			error = gfs2_ri_update(ip);
2153 			if (error)
2154 				return error;
2155 		}
2156 		/* Flushing the log may release space */
2157 		if (loops == 2)
2158 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2159 				       GFS2_LFC_INPLACE_RESERVE);
2160 	}
2161 
2162 	return -ENOSPC;
2163 }
2164 
2165 /**
2166  * gfs2_inplace_release - release an inplace reservation
2167  * @ip: the inode the reservation was taken out on
2168  *
2169  * Release a reservation made by gfs2_inplace_reserve().
2170  */
2171 
gfs2_inplace_release(struct gfs2_inode * ip)2172 void gfs2_inplace_release(struct gfs2_inode *ip)
2173 {
2174 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2175 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2176 }
2177 
2178 /**
2179  * gfs2_alloc_extent - allocate an extent from a given bitmap
2180  * @rbm: the resource group information
2181  * @dinode: TRUE if the first block we allocate is for a dinode
2182  * @n: The extent length (value/result)
2183  *
2184  * Add the bitmap buffer to the transaction.
2185  * Set the found bits to @new_state to change block's allocation state.
2186  */
gfs2_alloc_extent(const struct gfs2_rbm * rbm,bool dinode,unsigned int * n)2187 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2188 			     unsigned int *n)
2189 {
2190 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2191 	const unsigned int elen = *n;
2192 	u64 block;
2193 	int ret;
2194 
2195 	*n = 1;
2196 	block = gfs2_rbm_to_block(rbm);
2197 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2198 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2199 	block++;
2200 	while (*n < elen) {
2201 		ret = gfs2_rbm_from_block(&pos, block);
2202 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2203 			break;
2204 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2205 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2206 		(*n)++;
2207 		block++;
2208 	}
2209 }
2210 
2211 /**
2212  * rgblk_free - Change alloc state of given block(s)
2213  * @sdp: the filesystem
2214  * @rgd: the resource group the blocks are in
2215  * @bstart: the start of a run of blocks to free
2216  * @blen: the length of the block run (all must lie within ONE RG!)
2217  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2218  */
2219 
rgblk_free(struct gfs2_sbd * sdp,struct gfs2_rgrpd * rgd,u64 bstart,u32 blen,unsigned char new_state)2220 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2221 		       u64 bstart, u32 blen, unsigned char new_state)
2222 {
2223 	struct gfs2_rbm rbm;
2224 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2225 
2226 	rbm.rgd = rgd;
2227 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2228 		return;
2229 	while (blen--) {
2230 		bi = rbm_bi(&rbm);
2231 		if (bi != bi_prev) {
2232 			if (!bi->bi_clone) {
2233 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2234 						      GFP_NOFS | __GFP_NOFAIL);
2235 				memcpy(bi->bi_clone + bi->bi_offset,
2236 				       bi->bi_bh->b_data + bi->bi_offset,
2237 				       bi->bi_bytes);
2238 			}
2239 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2240 			bi_prev = bi;
2241 		}
2242 		gfs2_setbit(&rbm, false, new_state);
2243 		gfs2_rbm_incr(&rbm);
2244 	}
2245 }
2246 
2247 /**
2248  * gfs2_rgrp_dump - print out an rgrp
2249  * @seq: The iterator
2250  * @gl: The glock in question
2251  * @fs_id_buf: pointer to file system id (if requested)
2252  *
2253  */
2254 
gfs2_rgrp_dump(struct seq_file * seq,struct gfs2_glock * gl,const char * fs_id_buf)2255 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
2256 		    const char *fs_id_buf)
2257 {
2258 	struct gfs2_rgrpd *rgd = gl->gl_object;
2259 	struct gfs2_blkreserv *trs;
2260 	const struct rb_node *n;
2261 
2262 	if (rgd == NULL)
2263 		return;
2264 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2265 		       fs_id_buf,
2266 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2267 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2268 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2269 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2270 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2271 
2272 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2273 			       be32_to_cpu(rgl->rl_flags),
2274 			       be32_to_cpu(rgl->rl_free),
2275 			       be32_to_cpu(rgl->rl_dinodes));
2276 	}
2277 	spin_lock(&rgd->rd_rsspin);
2278 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2279 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2280 		dump_rs(seq, trs, fs_id_buf);
2281 	}
2282 	spin_unlock(&rgd->rd_rsspin);
2283 }
2284 
gfs2_rgrp_error(struct gfs2_rgrpd * rgd)2285 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2286 {
2287 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2288 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2289 
2290 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2291 		(unsigned long long)rgd->rd_addr);
2292 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2293 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2294 	gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
2295 	rgd->rd_flags |= GFS2_RDF_ERROR;
2296 }
2297 
2298 /**
2299  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2300  * @ip: The inode we have just allocated blocks for
2301  * @rbm: The start of the allocated blocks
2302  * @len: The extent length
2303  *
2304  * Adjusts a reservation after an allocation has taken place. If the
2305  * reservation does not match the allocation, or if it is now empty
2306  * then it is removed.
2307  */
2308 
gfs2_adjust_reservation(struct gfs2_inode * ip,const struct gfs2_rbm * rbm,unsigned len)2309 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2310 				    const struct gfs2_rbm *rbm, unsigned len)
2311 {
2312 	struct gfs2_blkreserv *rs = &ip->i_res;
2313 	struct gfs2_rgrpd *rgd = rbm->rgd;
2314 	unsigned rlen;
2315 	u64 block;
2316 	int ret;
2317 
2318 	spin_lock(&rgd->rd_rsspin);
2319 	if (gfs2_rs_active(rs)) {
2320 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2321 			block = gfs2_rbm_to_block(rbm);
2322 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2323 			rlen = min(rs->rs_free, len);
2324 			rs->rs_free -= rlen;
2325 			rgd->rd_reserved -= rlen;
2326 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2327 			if (rs->rs_free && !ret)
2328 				goto out;
2329 			/* We used up our block reservation, so we should
2330 			   reserve more blocks next time. */
2331 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2332 		}
2333 		__rs_deltree(rs);
2334 	}
2335 out:
2336 	spin_unlock(&rgd->rd_rsspin);
2337 }
2338 
2339 /**
2340  * gfs2_set_alloc_start - Set starting point for block allocation
2341  * @rbm: The rbm which will be set to the required location
2342  * @ip: The gfs2 inode
2343  * @dinode: Flag to say if allocation includes a new inode
2344  *
2345  * This sets the starting point from the reservation if one is active
2346  * otherwise it falls back to guessing a start point based on the
2347  * inode's goal block or the last allocation point in the rgrp.
2348  */
2349 
gfs2_set_alloc_start(struct gfs2_rbm * rbm,const struct gfs2_inode * ip,bool dinode)2350 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2351 				 const struct gfs2_inode *ip, bool dinode)
2352 {
2353 	u64 goal;
2354 
2355 	if (gfs2_rs_active(&ip->i_res)) {
2356 		*rbm = ip->i_res.rs_rbm;
2357 		return;
2358 	}
2359 
2360 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2361 		goal = ip->i_goal;
2362 	else
2363 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2364 
2365 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2366 		rbm->bii = 0;
2367 		rbm->offset = 0;
2368 	}
2369 }
2370 
2371 /**
2372  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2373  * @ip: the inode to allocate the block for
2374  * @bn: Used to return the starting block number
2375  * @nblocks: requested number of blocks/extent length (value/result)
2376  * @dinode: 1 if we're allocating a dinode block, else 0
2377  * @generation: the generation number of the inode
2378  *
2379  * Returns: 0 or error
2380  */
2381 
gfs2_alloc_blocks(struct gfs2_inode * ip,u64 * bn,unsigned int * nblocks,bool dinode,u64 * generation)2382 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2383 		      bool dinode, u64 *generation)
2384 {
2385 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2386 	struct buffer_head *dibh;
2387 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2388 	unsigned int ndata;
2389 	u64 block; /* block, within the file system scope */
2390 	int error;
2391 
2392 	gfs2_set_alloc_start(&rbm, ip, dinode);
2393 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2394 
2395 	if (error == -ENOSPC) {
2396 		gfs2_set_alloc_start(&rbm, ip, dinode);
2397 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2398 	}
2399 
2400 	/* Since all blocks are reserved in advance, this shouldn't happen */
2401 	if (error) {
2402 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2403 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2404 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2405 			rbm.rgd->rd_extfail_pt);
2406 		goto rgrp_error;
2407 	}
2408 
2409 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2410 	block = gfs2_rbm_to_block(&rbm);
2411 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2412 	if (gfs2_rs_active(&ip->i_res))
2413 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2414 	ndata = *nblocks;
2415 	if (dinode)
2416 		ndata--;
2417 
2418 	if (!dinode) {
2419 		ip->i_goal = block + ndata - 1;
2420 		error = gfs2_meta_inode_buffer(ip, &dibh);
2421 		if (error == 0) {
2422 			struct gfs2_dinode *di =
2423 				(struct gfs2_dinode *)dibh->b_data;
2424 			gfs2_trans_add_meta(ip->i_gl, dibh);
2425 			di->di_goal_meta = di->di_goal_data =
2426 				cpu_to_be64(ip->i_goal);
2427 			brelse(dibh);
2428 		}
2429 	}
2430 	if (rbm.rgd->rd_free < *nblocks) {
2431 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2432 		goto rgrp_error;
2433 	}
2434 
2435 	rbm.rgd->rd_free -= *nblocks;
2436 	if (dinode) {
2437 		rbm.rgd->rd_dinodes++;
2438 		*generation = rbm.rgd->rd_igeneration++;
2439 		if (*generation == 0)
2440 			*generation = rbm.rgd->rd_igeneration++;
2441 	}
2442 
2443 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2444 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2445 
2446 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2447 	if (dinode)
2448 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2449 
2450 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2451 
2452 	rbm.rgd->rd_free_clone -= *nblocks;
2453 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2454 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2455 	*bn = block;
2456 	return 0;
2457 
2458 rgrp_error:
2459 	gfs2_rgrp_error(rbm.rgd);
2460 	return -EIO;
2461 }
2462 
2463 /**
2464  * __gfs2_free_blocks - free a contiguous run of block(s)
2465  * @ip: the inode these blocks are being freed from
2466  * @rgd: the resource group the blocks are in
2467  * @bstart: first block of a run of contiguous blocks
2468  * @blen: the length of the block run
2469  * @meta: 1 if the blocks represent metadata
2470  *
2471  */
2472 
__gfs2_free_blocks(struct gfs2_inode * ip,struct gfs2_rgrpd * rgd,u64 bstart,u32 blen,int meta)2473 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2474 			u64 bstart, u32 blen, int meta)
2475 {
2476 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2477 
2478 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2479 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2480 	rgd->rd_free += blen;
2481 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2482 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2483 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2484 
2485 	/* Directories keep their data in the metadata address space */
2486 	if (meta || ip->i_depth)
2487 		gfs2_meta_wipe(ip, bstart, blen);
2488 }
2489 
2490 /**
2491  * gfs2_free_meta - free a contiguous run of data block(s)
2492  * @ip: the inode these blocks are being freed from
2493  * @rgd: the resource group the blocks are in
2494  * @bstart: first block of a run of contiguous blocks
2495  * @blen: the length of the block run
2496  *
2497  */
2498 
gfs2_free_meta(struct gfs2_inode * ip,struct gfs2_rgrpd * rgd,u64 bstart,u32 blen)2499 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2500 		    u64 bstart, u32 blen)
2501 {
2502 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2503 
2504 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2505 	gfs2_statfs_change(sdp, 0, +blen, 0);
2506 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2507 }
2508 
gfs2_unlink_di(struct inode * inode)2509 void gfs2_unlink_di(struct inode *inode)
2510 {
2511 	struct gfs2_inode *ip = GFS2_I(inode);
2512 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2513 	struct gfs2_rgrpd *rgd;
2514 	u64 blkno = ip->i_no_addr;
2515 
2516 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2517 	if (!rgd)
2518 		return;
2519 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2520 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2521 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2522 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2523 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2524 }
2525 
gfs2_free_di(struct gfs2_rgrpd * rgd,struct gfs2_inode * ip)2526 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2527 {
2528 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2529 
2530 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2531 	if (!rgd->rd_dinodes)
2532 		gfs2_consist_rgrpd(rgd);
2533 	rgd->rd_dinodes--;
2534 	rgd->rd_free++;
2535 
2536 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2537 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2538 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2539 
2540 	gfs2_statfs_change(sdp, 0, +1, -1);
2541 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2542 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2543 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2544 }
2545 
2546 /**
2547  * gfs2_check_blk_type - Check the type of a block
2548  * @sdp: The superblock
2549  * @no_addr: The block number to check
2550  * @type: The block type we are looking for
2551  *
2552  * Returns: 0 if the block type matches the expected type
2553  *          -ESTALE if it doesn't match
2554  *          or -ve errno if something went wrong while checking
2555  */
2556 
gfs2_check_blk_type(struct gfs2_sbd * sdp,u64 no_addr,unsigned int type)2557 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2558 {
2559 	struct gfs2_rgrpd *rgd;
2560 	struct gfs2_holder rgd_gh;
2561 	struct gfs2_rbm rbm;
2562 	int error = -EINVAL;
2563 
2564 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2565 	if (!rgd)
2566 		goto fail;
2567 
2568 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2569 	if (error)
2570 		goto fail;
2571 
2572 	rbm.rgd = rgd;
2573 	error = gfs2_rbm_from_block(&rbm, no_addr);
2574 	if (WARN_ON_ONCE(error))
2575 		goto fail;
2576 
2577 	if (gfs2_testbit(&rbm, false) != type)
2578 		error = -ESTALE;
2579 
2580 	gfs2_glock_dq_uninit(&rgd_gh);
2581 fail:
2582 	return error;
2583 }
2584 
2585 /**
2586  * gfs2_rlist_add - add a RG to a list of RGs
2587  * @ip: the inode
2588  * @rlist: the list of resource groups
2589  * @block: the block
2590  *
2591  * Figure out what RG a block belongs to and add that RG to the list
2592  *
2593  * FIXME: Don't use NOFAIL
2594  *
2595  */
2596 
gfs2_rlist_add(struct gfs2_inode * ip,struct gfs2_rgrp_list * rlist,u64 block)2597 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2598 		    u64 block)
2599 {
2600 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2601 	struct gfs2_rgrpd *rgd;
2602 	struct gfs2_rgrpd **tmp;
2603 	unsigned int new_space;
2604 	unsigned int x;
2605 
2606 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2607 		return;
2608 
2609 	/*
2610 	 * The resource group last accessed is kept in the last position.
2611 	 */
2612 
2613 	if (rlist->rl_rgrps) {
2614 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2615 		if (rgrp_contains_block(rgd, block))
2616 			return;
2617 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2618 	} else {
2619 		rgd = ip->i_res.rs_rbm.rgd;
2620 		if (!rgd || !rgrp_contains_block(rgd, block))
2621 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2622 	}
2623 
2624 	if (!rgd) {
2625 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2626 		       (unsigned long long)block);
2627 		return;
2628 	}
2629 
2630 	for (x = 0; x < rlist->rl_rgrps; x++) {
2631 		if (rlist->rl_rgd[x] == rgd) {
2632 			swap(rlist->rl_rgd[x],
2633 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2634 			return;
2635 		}
2636 	}
2637 
2638 	if (rlist->rl_rgrps == rlist->rl_space) {
2639 		new_space = rlist->rl_space + 10;
2640 
2641 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2642 			      GFP_NOFS | __GFP_NOFAIL);
2643 
2644 		if (rlist->rl_rgd) {
2645 			memcpy(tmp, rlist->rl_rgd,
2646 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2647 			kfree(rlist->rl_rgd);
2648 		}
2649 
2650 		rlist->rl_space = new_space;
2651 		rlist->rl_rgd = tmp;
2652 	}
2653 
2654 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2655 }
2656 
2657 /**
2658  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2659  *      and initialize an array of glock holders for them
2660  * @rlist: the list of resource groups
2661  *
2662  * FIXME: Don't use NOFAIL
2663  *
2664  */
2665 
gfs2_rlist_alloc(struct gfs2_rgrp_list * rlist)2666 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2667 {
2668 	unsigned int x;
2669 
2670 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2671 				      sizeof(struct gfs2_holder),
2672 				      GFP_NOFS | __GFP_NOFAIL);
2673 	for (x = 0; x < rlist->rl_rgrps; x++)
2674 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2675 				LM_ST_EXCLUSIVE, 0,
2676 				&rlist->rl_ghs[x]);
2677 }
2678 
2679 /**
2680  * gfs2_rlist_free - free a resource group list
2681  * @rlist: the list of resource groups
2682  *
2683  */
2684 
gfs2_rlist_free(struct gfs2_rgrp_list * rlist)2685 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2686 {
2687 	unsigned int x;
2688 
2689 	kfree(rlist->rl_rgd);
2690 
2691 	if (rlist->rl_ghs) {
2692 		for (x = 0; x < rlist->rl_rgrps; x++)
2693 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2694 		kfree(rlist->rl_ghs);
2695 		rlist->rl_ghs = NULL;
2696 	}
2697 }
2698 
2699