aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/mballoc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--fs/ext4/mballoc.c1312
1 files changed, 765 insertions, 547 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 17a5a57c415..2dcb936be90 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,11 +21,20 @@
* mballoc.c contains the multiblocks allocation routines
*/
+#include "ext4_jbd2.h"
#include "mballoc.h"
-#include <linux/debugfs.h>
+#include <linux/log2.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
+
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -70,8 +79,8 @@
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
- * pa_len -> length for this prealloc space
- * pa_free -> free space available in this prealloc space
+ * pa_len -> length for this prealloc space (in clusters)
+ * pa_free -> free space available in this prealloc space (in clusters)
*
* The inode preallocation space is used looking at the _logical_ start
* block. If only the logical file block falls within the range of prealloc
@@ -126,7 +135,8 @@
* list. In case of inode preallocation we follow a list of heuristics
* based on file size. This can be found in ext4_mb_normalize_request. If
* we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
* 512 blocks. This can be tuned via
* /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
@@ -338,7 +348,7 @@
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
/* We create slab caches for groupinfo data structures based on the
* superblock block size. There will be one per mounted filesystem for
@@ -356,7 +366,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int rc);
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -394,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
ext4_clear_bit(bit, addr);
}
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{ /* Clear one bit of a bitmap and return the result of ext4_test_and_clear_bit(). */
+ addr = mb_correct_addr_and_bit(&bit, addr); /* helper may rewrite bit and returns the address to use */
+ return ext4_test_and_clear_bit(bit, addr); /* mb_test_and_clear_bits() reads 0 as: bit was already clear */
+}
+
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
int fix = 0, ret, tmpmax;
@@ -424,7 +441,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
char *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(max == NULL);
if (order > e4b->bd_blkbits + 1) {
@@ -435,10 +452,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
/* at order 0 we see each particular block */
if (order == 0) {
*max = 1 << (e4b->bd_blkbits + 3);
- return EXT4_MB_BITMAP(e4b);
+ return e4b->bd_bitmap;
}
- bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+ bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
return bb;
@@ -459,7 +476,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += first + i;
+ blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
@@ -580,14 +597,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
continue;
}
- /* both bits in buddy2 must be 0 */
+ /* both bits in buddy2 must be 1 */
MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
for (j = 0; j < (1 << order); j++) {
k = (i * (1 << order)) + j;
MB_CHECK_ASSERT(
- !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+ !mb_test_bit(k, e4b->bd_bitmap));
}
count++;
}
@@ -653,7 +670,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
ext4_grpblk_t chunk;
unsigned short border;
- BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
+ BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
border = 2 << sb->s_blocksize_bits;
@@ -705,7 +722,8 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
ext4_grpblk_t first;
ext4_grpblk_t len;
@@ -734,13 +752,18 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u blocks in bitmap, %u in gd",
+ "block bitmap and bg descriptor "
+ "inconsistent: %u vs %u free clusters",
free, grp->bb_free);
/*
- * If we intent to continue, we consider group descritor
+ * If we intend to continue, we consider group descriptor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
mb_set_largest_free_order(sb, grp);
@@ -753,6 +776,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+ /* Rebuild the buddy data of e4b: first set all bits on every level from order 1 up. */
+ while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+ ext4_set_bits(buddy, 0, count);
+ }
+ e4b->bd_info->bb_fragments = 0; /* fragment count restarts from zero */
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2)); /* zeroes s_blocksize_bits + 2 counters */
+ /* Then let ext4_mb_generate_buddy() refill buddy and counters from bd_bitmap. */
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -781,11 +822,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
int groups_per_page;
int err = 0;
int i;
- ext4_group_t first_group;
+ ext4_group_t first_group, group;
int first_block;
struct super_block *sb;
struct buffer_head *bhs;
- struct buffer_head **bh;
+ struct buffer_head **bh = NULL;
struct inode *inode;
char *data;
char *bitmap;
@@ -805,24 +846,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
- err = -ENOMEM;
i = sizeof(struct buffer_head *) * groups_per_page;
bh = kzalloc(i, GFP_NOFS);
- if (bh == NULL)
+ if (bh == NULL) {
+ err = -ENOMEM;
goto out;
+ }
} else
bh = &bhs;
first_group = page->index * blocks_per_page / 2;
/* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
- struct ext4_group_desc *desc;
-
- if (first_group + i >= ngroups)
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (group >= ngroups)
break;
- grinfo = ext4_get_group_info(sb, first_group + i);
+ grinfo = ext4_get_group_info(sb, group);
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
@@ -833,73 +873,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
bh[i] = NULL;
continue;
}
-
- err = -EIO;
- desc = ext4_get_group_desc(sb, first_group + i, NULL);
- if (desc == NULL)
- goto out;
-
- err = -ENOMEM;
- bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
- if (bh[i] == NULL)
+ if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+ err = -ENOMEM;
goto out;
-
- if (bitmap_uptodate(bh[i]))
- continue;
-
- lock_buffer(bh[i]);
- if (bitmap_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
}
- ext4_lock_group(sb, first_group + i);
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- ext4_init_block_bitmap(sb, bh[i],
- first_group + i, desc);
- set_bitmap_uptodate(bh[i]);
- set_buffer_uptodate(bh[i]);
- ext4_unlock_group(sb, first_group + i);
- unlock_buffer(bh[i]);
- continue;
- }
- ext4_unlock_group(sb, first_group + i);
- if (buffer_uptodate(bh[i])) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh[i]);
- unlock_buffer(bh[i]);
- continue;
- }
- get_bh(bh[i]);
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh[i]);
- bh[i]->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh[i]);
- mb_debug(1, "read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
- for (i = 0; i < groups_per_page; i++)
- if (bh[i])
- wait_on_buffer(bh[i]);
-
- err = -EIO;
- for (i = 0; i < groups_per_page; i++)
- if (bh[i] && !buffer_uptodate(bh[i]))
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+ err = -EIO;
goto out;
+ }
+ }
- err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
- int group;
-
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
@@ -1003,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
@@ -1015,10 +1005,9 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block++;
pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_buddy_page = page;
return 0;
@@ -1050,6 +1039,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
struct page *page;
int ret = 0;
+ might_sleep();
mb_debug(1, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
/*
@@ -1058,6 +1048,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
* would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
*/
ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1076,7 +1068,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
if (e4b.bd_buddy_page == NULL) {
/*
@@ -1096,7 +1087,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
err:
ext4_mb_put_buddy_page_lock(&e4b);
return ret;
@@ -1121,6 +1111,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
+ might_sleep();
mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
@@ -1154,7 +1145,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
/* we could use find_or_create_page(), but it locks page
* what we'd like to avoid in fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
/*
@@ -1181,19 +1172,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
@@ -1210,13 +1206,18 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
@@ -1249,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
int order = 1;
void *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
- bb = EXT4_MB_BUDDY(e4b);
+ bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
block = block >> 1;
if (!mb_test_bit(block, bb)) {
@@ -1283,6 +1284,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
}
}
+/* Clear len bits of bm starting at bit cur. Returns the index of the first
+ * bit in that range that was already zero before clearing, or -1 if none was.
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+ __u32 *addr;
+ int zero_bit = -1;
+
+ len = cur + len; /* from here on len is the end index (exclusive) */
+ while (cur < len) {
+ if ((cur & 31) == 0 && (len - cur) >= 32) {
+ /* fast path: cur is 32-bit aligned and a full word remains, clear it at once */
+ addr = bm + (cur >> 3); /* byte offset of the word holding bit cur */
+ if (*addr != (__u32)(-1) && zero_bit == -1) /* word not all ones and nothing recorded yet */
+ zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+ *addr = 0;
+ cur += 32;
+ continue;
+ }
+ if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) /* slow path: one bit at a time */
+ zero_bit = cur;
+ cur++;
+ }
+
+ return zero_bit;
+}
+
void ext4_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1301,18 +1329,95 @@ void ext4_set_bits(void *bm, int cur, int len)
}
}
+/* Handle one border (*bit) of a freed range that is not aligned to the next buddy order.
+ * side is -1 for the left border, 1 for the right; the return value is added to bb_counters by the caller. */
+
+static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
+{
+ if (mb_test_bit(*bit + side, bitmap)) { /* neighbour bit is set (busy): no merge possible */
+ mb_clear_bit(*bit, bitmap); /* the border stays behind as a free chunk on this level */
+ (*bit) -= side; /* shrink the range away from that neighbour */
+ return 1;
+ }
+ else { /* neighbour bit is clear (free): merge with it */
+ (*bit) += side; /* extend the range over the neighbour */
+ mb_set_bit(*bit, bitmap); /* neighbour is marked busy on this level */
+ return -1;
+ }
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+ int max;
+ int order = 1;
+ void *buddy = mb_find_buddy(e4b, order, &max);
+ /* Mark [first; last] free in the buddy bitmaps, starting at order 1 and climbing while the range merges. */
+ while (buddy) {
+ void *buddy2;
+
+ /* Bits in range [first; last] are known to be set since
+ * corresponding blocks were allocated. Bits in range
+ * (first; last) will stay set because they form buddies on
+ * upper layer. We just deal with borders if they don't
+ * align with upper layer and then go up.
+ * Releasing entire group is all about clearing
+ * single bit of highest order buddy.
+ */
+
+ /* Example:
+ * ---------------------------------
+ * | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * 0 1 2 3 4 5 6 7
+ * \_____________________/
+ *
+ * Neither [1] nor [6] is aligned to above layer.
+ * Left neighbour [0] is free, so mark it busy,
+ * decrease bb_counters and extend range to
+ * [0; 6]
+ * Right neighbour [7] is busy. It can't be coalesced with [6], so
+ * mark [6] free, increase bb_counters and shrink range to
+ * [0; 5].
+ * Then shift range to [0; 2], go up and do the same.
+ */
+
+
+ if (first & 1) /* odd first: left border is unaligned */
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+ if (!(last & 1)) /* even last: right border is unaligned */
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+ if (first > last) /* the border fix-ups used up the whole range */
+ break;
+ order++;
+
+ if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { /* single chunk left, or no higher level */
+ mb_clear_bits(buddy, first, last - first + 1);
+ e4b->bd_info->bb_counters[order - 1] += last - first + 1; /* order was already bumped; order - 1 is the level of buddy */
+ break;
+ }
+ first >>= 1; /* translate the range to indices of the next level */
+ last >>= 1;
+ buddy = buddy2;
+ }
+}
+
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
- int first, int count)
+ int first, int count)
{
- int block = 0;
- int max = 0;
- int order;
- void *buddy;
- void *buddy2;
+ int left_is_free = 0;
+ int right_is_free = 0;
+ int block;
+ int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
- BUG_ON(first + count > (sb->s_blocksize << 3));
+ BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ /* Don't bother if the block group is corrupt. */
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return;
+
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1320,83 +1425,77 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
- /* let's maintain fragments counter */
+ /* access memory sequentially: check left neighbour,
+ * clear range and then check right neighbour
+ */
if (first != 0)
- block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
- if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
- max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
- if (block && max)
+ left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+ block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+ if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+ right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
+
+ if (unlikely(block != -1)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t blocknr;
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block "
+ "(bit %u); block bitmap corrupt.",
+ block);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ e4b->bd_info->bb_free);
+ /* Mark the block group as corrupt. */
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &e4b->bd_info->bb_state);
+ mb_regenerate_buddy(e4b);
+ goto done;
+ }
+
+ /* let's maintain fragments counter */
+ if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
- else if (!block && !max)
+ else if (!left_is_free && !right_is_free)
e4b->bd_info->bb_fragments++;
- /* let's maintain buddy itself */
- while (count-- > 0) {
- block = first++;
- order = 0;
-
- if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
- ext4_fsblk_t blocknr;
-
- blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += block;
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u)", block);
- }
- mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
- e4b->bd_info->bb_counters[order]++;
-
- /* start of the buddy */
- buddy = mb_find_buddy(e4b, order, &max);
-
- do {
- block &= ~1UL;
- if (mb_test_bit(block, buddy) ||
- mb_test_bit(block + 1, buddy))
- break;
-
- /* both the buddies are free, try to coalesce them */
- buddy2 = mb_find_buddy(e4b, order + 1, &max);
-
- if (!buddy2)
- break;
-
- if (order > 0) {
- /* for special purposes, we don't set
- * free bits in bitmap */
- mb_set_bit(block, buddy);
- mb_set_bit(block + 1, buddy);
- }
- e4b->bd_info->bb_counters[order]--;
- e4b->bd_info->bb_counters[order]--;
+ /* buddy[0] == bd_bitmap is a special case, so handle
+ * it right away and let mb_buddy_mark_free stay free of
+ * zero order checks.
+ * Check if neighbours are to be coalesced,
+ * adjust bitmap bb_counters and borders appropriately.
+ */
+ if (first & 1) {
+ first += !left_is_free;
+ e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+ }
+ if (!(last & 1)) {
+ last -= !right_is_free;
+ e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+ }
- block = block >> 1;
- order++;
- e4b->bd_info->bb_counters[order]++;
+ if (first <= last)
+ mb_buddy_mark_free(e4b, first >> 1, last >> 1);
- mb_clear_bit(block, buddy2);
- buddy = buddy2;
- } while (1);
- }
+done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
- int max;
- int ord;
+ int max, order;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
- buddy = mb_find_buddy(e4b, order, &max);
+ buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
if (mb_test_bit(block, buddy)) {
@@ -1406,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
return 0;
}
- /* FIXME dorp order completely ? */
- if (likely(order == 0)) {
- /* find actual order */
- order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- }
+ /* find actual order */
+ order = mb_find_order_for_block(e4b, block);
+ block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
@@ -1423,18 +1519,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
ex->fe_start += next;
while (needed > ex->fe_len &&
- (buddy = mb_find_buddy(e4b, order, &max))) {
+ mb_find_buddy(e4b, order, &max)) {
if (block + 1 >= max)
break;
next = (block + 1) * (1 << order);
- if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+ if (mb_test_bit(next, e4b->bd_bitmap))
break;
- ord = mb_find_order_for_block(e4b, next);
+ order = mb_find_order_for_block(e4b, next);
- order = ord;
block = next >> order;
ex->fe_len += 1 << order;
}
@@ -1467,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain fragments counter */
if (start != 0)
- mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+ mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
- max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+ max = !mb_test_bit(start + len, e4b->bd_bitmap);
if (mlen && max)
e4b->bd_info->bb_fragments++;
else if (!mlen && !max)
@@ -1512,7 +1607,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
- ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1598,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
/* recheck chunk's availability - we don't know
* when it was found (within this lock-unlock
* period or not) */
- max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+ max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
if (max >= gex->fe_len) {
ext4_mb_use_best_found(ac, e4b);
return;
@@ -1624,8 +1719,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
struct ext4_free_extent *gex = &ac->ac_g_ex;
BUG_ON(ex->fe_len <= 0);
- BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
- BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
@@ -1690,7 +1785,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+ max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
ac->ac_b_ex = ex;
@@ -1711,18 +1806,27 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
return 0;
+ if (grp->bb_free == 0)
+ return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
return err;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
+ ext4_mb_unload_buddy(e4b);
+ return 0;
+ }
+
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+ max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
+ ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
ext4_fsblk_t start;
@@ -1811,7 +1915,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
int i;
int free;
@@ -1823,25 +1927,25 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
while (free && ac->ac_status == AC_STATUS_CONTINUE) {
i = mb_find_next_zero_bit(bitmap,
- EXT4_BLOCKS_PER_GROUP(sb), i);
- if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
* we have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But bitmap says 0",
free);
break;
}
- mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+ mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
/*
@@ -1851,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
*/
break;
}
-
+ ex.fe_logical = 0xDEADC0DE; /* debug value */
ext4_mb_measure_extent(ac, &ex, e4b);
i += ex.fe_len;
@@ -1871,7 +1975,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
{
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
@@ -1887,11 +1991,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
- while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
+ while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+ max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
+ ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
@@ -1911,6 +2016,15 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
+ free = grp->bb_free;
+ if (free == 0)
+ return 0;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ return 0;
+
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return 0;
+
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1918,10 +2032,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
- free = grp->bb_free;
fragments = grp->bb_fragments;
- if (free == 0)
- return 0;
if (fragments == 0)
return 0;
@@ -1929,15 +2040,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
- if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
-
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
+ if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+ (free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
@@ -2028,7 +2143,12 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- if (group == ngroups)
+ cond_resched();
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ if (group >= ngroups)
group = 0;
/* This now checks without needing the buddy page */
@@ -2052,7 +2172,7 @@ repeat:
}
ac->ac_groups_scanned++;
- if (cr == 0)
+ if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 && sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % sbi->s_stripe))
@@ -2125,8 +2245,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct super_block *sb = seq->private;
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
- int err;
+ int err, buddy_loaded = 0;
struct ext4_buddy e4b;
+ struct ext4_group_info *grinfo;
struct sg {
struct ext4_group_info info;
ext4_grpblk_t counters[16];
@@ -2143,15 +2264,21 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
- return 0;
+ grinfo = ext4_get_group_info(sb, group);
+ /* Load the group info in memory only if not already loaded. */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err) {
+ seq_printf(seq, "#%-5u: I/O error\n", group);
+ return 0;
+ }
+ buddy_loaded = 1;
}
- ext4_lock_group(sb, group);
+
memcpy(&sg, ext4_get_group_info(sb, group), i);
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
+
+ if (buddy_loaded)
+ ext4_mb_unload_buddy(&e4b);
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2176,7 +2303,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = {
static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
{
- struct super_block *sb = PDE(inode)->data;
+ struct super_block *sb = PDE_DATA(inode);
int rc;
rc = seq_open(file, &ext4_mb_seq_groups_ops);
@@ -2205,6 +2332,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
return cachep;
}
+/*
+ * Make sbi->s_group_info large enough for ngroups groups, copying over any
+ * existing entries. Returns 0 (also when already big enough) or -ENOMEM.
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned size;
+ struct ext4_group_info ***new_groupinfo;
+
+ size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+ EXT4_DESC_PER_BLOCK_BITS(sb); /* slots needed; presumably ngroups / descs-per-block rounded up - TODO confirm */
+ if (size <= sbi->s_group_info_size) /* current array already has enough slots */
+ return 0;
+
+ size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); /* size is a byte count from here */
+ new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groupinfo) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+ return -ENOMEM;
+ }
+ if (sbi->s_group_info) { /* carry the old entries over, then drop the old array */
+ memcpy(new_groupinfo, sbi->s_group_info,
+ sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+ ext4_kvfree(sbi->s_group_info);
+ }
+ sbi->s_group_info = new_groupinfo;
+ sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); /* bytes back to a slot count */
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ sbi->s_group_info_size);
+ return 0;
+}
+
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
@@ -2225,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
EXT4_DESC_PER_BLOCK_BITS(sb);
meta_group_info = kmalloc(metalen, GFP_KERNEL);
if (meta_group_info == NULL) {
- ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem "
+ ext4_msg(sb, KERN_ERR, "can't allocate mem "
"for a buddy group");
goto exit_meta_group_info;
}
@@ -2237,12 +2397,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
+ meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
if (meta_group_info[i] == NULL) {
- ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem");
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
}
- memset(meta_group_info[i], 0, kmem_cache_size(cachep));
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
&(meta_group_info[i]->bb_state));
@@ -2252,10 +2411,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
*/
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
meta_group_info[i]->bb_free =
- ext4_free_blocks_after_init(sb, group, desc);
+ ext4_free_clusters_after_init(sb, group, desc);
} else {
meta_group_info[i]->bb_free =
- ext4_free_blks_count(sb, desc);
+ ext4_free_group_clusters(sb, desc);
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2294,49 +2453,14 @@ static int ext4_mb_init_backend(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
- int num_meta_group_infos;
- int num_meta_group_infos_max;
- int array_size;
+ int err;
struct ext4_group_desc *desc;
struct kmem_cache *cachep;
- /* This is the number of blocks used by GDT */
- num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
- 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
-
- /*
- * This is the total number of blocks used by GDT including
- * the number of reserved blocks for GDT.
- * The s_group_info array is allocated with this value
- * to allow a clean online resize without a complex
- * manipulation of pointer.
- * The drawback is the unused memory when no resize
- * occurs but it's very low in terms of pages
- * (see comments below)
- * Need to handle this properly when META_BG resizing is allowed
- */
- num_meta_group_infos_max = num_meta_group_infos +
- le16_to_cpu(es->s_reserved_gdt_blocks);
+ err = ext4_mb_alloc_groupinfo(sb, ngroups);
+ if (err)
+ return err;
- /*
- * array_size is the size of s_group_info array. We round it
- * to the next power of two because this approximation is done
- * internally by kmalloc so we can have some more memory
- * for free here (e.g. may be used for META_BG resize).
- */
- array_size = 1;
- while (array_size < sizeof(*sbi->s_group_info) *
- num_meta_group_infos_max)
- array_size = array_size << 1;
- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
- * So a two level scheme suffices for now. */
- sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
- if (sbi->s_group_info == NULL) {
- ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
- return -ENOMEM;
- }
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
ext4_msg(sb, KERN_ERR, "can't get new inode");
@@ -2364,7 +2488,7 @@ err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
- i = num_meta_group_infos;
+ i = sbi->s_group_info_size;
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -2423,7 +2547,7 @@ static int ext4_groupinfo_create_slab(size_t size)
return 0;
}
-int ext4_mb_init(struct super_block *sb, int needs_recovery)
+int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
@@ -2473,7 +2597,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+ * systems, this is probably too big (i.e., if the cluster size
+ * is 1 megabyte, then group preallocation size becomes half a
+ * gigabyte!). As a default, we will keep a two megabyte
+ * group prealloc size for cluster sizes up to 64k, and after
+ * that, we will force a minimum group preallocation size of
+ * 32 clusters. This translates to 8 megs when the cluster
+ * size is 256k, and 32 megs when the cluster size is 1 meg,
+ * which seems reasonable as a default.
+ */
+ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+ sbi->s_cluster_bits, 32);
/*
* If there is a s_stripe > 1, then we set the s_mb_group_prealloc
* to the lowest multiple of s_stripe which is bigger than
@@ -2503,21 +2640,23 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
/* init file for buddy data */
ret = ext4_mb_init_backend(sb);
- if (ret != 0) {
- goto out;
- }
+ if (ret != 0)
+ goto out_free_locality_groups;
if (sbi->s_proc)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
- if (sbi->s_journal)
- sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+ return 0;
+
+out_free_locality_groups:
+ free_percpu(sbi->s_locality_groups);
+ sbi->s_locality_groups = NULL;
out:
- if (ret) {
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- }
+ kfree(sbi->s_mb_offsets);
+ sbi->s_mb_offsets = NULL;
+ kfree(sbi->s_mb_maxs);
+ sbi->s_mb_maxs = NULL;
return ret;
}
@@ -2548,6 +2687,9 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
+
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2595,18 +2737,18 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
- if (sbi->s_proc)
- remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t block, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
ext4_fsblk_t discard_block;
- discard_block = block + ext4_group_first_block_no(sb, block_group);
+ discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
+ ext4_group_first_block_no(sb, block_group));
+ count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
@@ -2616,96 +2758,67 @@ static inline int ext4_issue_discard(struct super_block *sb,
* This function is called by the jbd2 layer once the commit has finished,
* so we know we can free the blocks that were released with that commit.
*/
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc)
{
- struct super_block *sb = journal->j_private;
+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
struct ext4_buddy e4b;
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- struct ext4_free_data *entry;
- struct list_head *l, *ltmp;
- list_for_each_safe(l, ltmp, &txn->t_private_list) {
- entry = list_entry(l, struct ext4_free_data, list);
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->efd_count, entry->efd_group, entry);
+
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%d failed"
+ " with %d", entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count, err);
+ }
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
- entry->count, entry->group, entry);
+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, entry->group,
- entry->start_blk, entry->count);
- err = ext4_mb_load_buddy(sb, entry->group, &e4b);
- /* we expect to find existing buddy because it's pinned */
- BUG_ON(err != 0);
+ db = e4b.bd_info;
+ /* there are blocks to put in buddy to make them really free */
+ count += entry->efd_count;
+ count2++;
+ ext4_lock_group(sb, entry->efd_group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->efd_node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
- db = e4b.bd_info;
- /* there are blocks to put in buddy to make them really free */
- count += entry->count;
- count2++;
- ext4_lock_group(sb, entry->group);
- /* Take it out of per group rb tree */
- rb_erase(&entry->node, &(db->bb_free_root));
- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+ /*
+ * Clear the trimmed flag for the group so that the next
+ * ext4_trim_fs can trim it.
+ * If the volume is mounted with -o discard, online discard
+ * is supported and the free blocks will be trimmed online.
+ */
+ if (!test_opt(sb, DISCARD))
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
- /*
- * Clear the trimmed flag for the group so that the next
- * ext4_trim_fs can trim it.
- * If the volume is mounted with -o discard, online discard
- * is supported and the free blocks will be trimmed online.
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
*/
- if (!test_opt(sb, DISCARD))
- EXT4_MB_GRP_CLEAR_TRIMMED(db);
-
- if (!db->bb_free_root.rb_node) {
- /* No more items in the per group rb tree
- * balance refcounts from ext4_mb_free_metadata()
- */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
- }
- ext4_unlock_group(sb, entry->group);
- kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_unload_buddy(&e4b);
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
+ ext4_unlock_group(sb, entry->efd_group);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ ext4_mb_unload_buddy(&e4b);
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
-#ifdef CONFIG_EXT4_DEBUG
-u8 mb_enable_debug __read_mostly;
-
-static struct dentry *debugfs_dir;
-static struct dentry *debugfs_debug;
-
-static void __init ext4_create_debugfs_entry(void)
-{
- debugfs_dir = debugfs_create_dir("ext4", NULL);
- if (debugfs_dir)
- debugfs_debug = debugfs_create_u8("mballoc-debug",
- S_IRUGO | S_IWUSR,
- debugfs_dir,
- &mb_enable_debug);
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
- debugfs_remove(debugfs_debug);
- debugfs_remove(debugfs_dir);
-}
-
-#else
-
-static void __init ext4_create_debugfs_entry(void)
-{
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
-}
-
-#endif
-
int __init ext4_init_mballoc(void)
{
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
@@ -2720,14 +2833,13 @@ int __init ext4_init_mballoc(void)
return -ENOMEM;
}
- ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
- SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_ext_cachep == NULL) {
+ ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+ SLAB_RECLAIM_ACCOUNT);
+ if (ext4_free_data_cachep == NULL) {
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
- ext4_create_debugfs_entry();
return 0;
}
@@ -2740,9 +2852,8 @@ void ext4_exit_mballoc(void)
rcu_barrier();
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
- kmem_cache_destroy(ext4_free_ext_cachep);
+ kmem_cache_destroy(ext4_free_data_cachep);
ext4_groupinfo_destroy_slabs();
- ext4_remove_debugfs_entry();
}
@@ -2752,7 +2863,7 @@ void ext4_exit_mballoc(void)
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned int reserv_blks)
+ handle_t *handle, unsigned int reserv_clstrs)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
@@ -2773,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!bitmap_bh)
goto out_err;
+ BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
if (err)
goto out_err;
@@ -2783,18 +2895,19 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
goto out_err;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
- ext4_free_blks_count(sb, gdp));
+ ext4_free_group_clusters(sb, gdp));
+ BUFFER_TRACE(gdp_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- len = ac->ac_b_ex.fe_len;
+ len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_data_block_valid(sbi, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
- "fs metadata\n", block, block+len);
+ "fs metadata", block, block+len);
/* File system mounted not to panic on error
* Fix the bitmap and repeat the block allocation
* We leak some of the blocks here.
@@ -2823,28 +2936,30 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_blks_set(sb, gdp,
- ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
}
- len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_blks_set(sb, gdp, len);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+ len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_group_clusters_set(sb, gdp, len);
+ ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+ percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
*/
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
- atomic_sub(ac->ac_b_ex.fe_len,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_sub(ac->ac_b_ex.fe_len,
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2853,7 +2968,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
out_err:
- ext4_mark_super_dirty(sb);
brelse(bitmap_bh);
return err;
}
@@ -2886,9 +3000,11 @@ static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits, max;
ext4_lblk_t end;
- loff_t size, orig_size, start_off;
+ loff_t size, start_off;
+ loff_t orig_size __maybe_unused;
ext4_lblk_t start;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *pa;
@@ -2916,7 +3032,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -2988,7 +3104,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
continue;
}
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3018,9 +3135,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
ext4_lblk_t pa_end;
+
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
}
spin_unlock(&pa->pa_lock);
@@ -3043,7 +3162,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* XXX: is it better to align blocks WRT to logical
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
- ac->ac_g_ex.fe_len = size;
+ ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size))) {
@@ -3097,13 +3216,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
- int len;
-
- if (pa && pa->pa_type == MB_INODE_PA) {
- len = ac->ac_b_ex.fe_len;
- pa->pa_free += len;
- }
+ if (pa && pa->pa_type == MB_INODE_PA)
+ pa->pa_free += ac->ac_b_ex.fe_len;
}
/*
@@ -3112,14 +3227,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_fsblk_t start;
ext4_fsblk_t end;
int len;
/* found preallocated blocks, use them */
start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
- end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
- len = end - start;
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+ start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+ len = EXT4_NUM_B2C(sbi, end - start);
ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
ac->ac_b_ex.fe_len = len;
@@ -3127,7 +3244,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
ac->ac_pa = pa;
BUG_ON(start < pa->pa_pstart);
- BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+ BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
@@ -3193,6 +3310,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
@@ -3210,12 +3328,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* all fields in this condition don't change,
* so we can skip locking for them */
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+ ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+ EXT4_C2B(sbi, pa->pa_len)))
continue;
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS))
continue;
/* found preallocated blocks, use them */
@@ -3290,8 +3410,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
n = rb_first(&(grp->bb_free_root));
while (n) {
- entry = rb_entry(n, struct ext4_free_data, node);
- ext4_set_bits(bitmap, entry->start_blk, entry->count);
+ entry = rb_entry(n, struct ext4_free_data, efd_node);
+ ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -3312,7 +3432,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t groupnr;
ext4_grpblk_t start;
int preallocated = 0;
- int count = 0;
int len;
/* all form of preallocation discards first load group,
@@ -3335,7 +3454,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
BUG_ON(groupnr != group);
ext4_set_bits(bitmap, start, len);
preallocated += len;
- count++;
}
mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
@@ -3344,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
@@ -3357,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
ext4_group_t grp;
ext4_fsblk_t grp_blk;
- if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
- return;
-
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+ spin_unlock(&pa->pa_lock);
+ return;
+ }
+
if (pa->pa_deleted == 1) {
spin_unlock(&pa->pa_lock);
return;
@@ -3378,7 +3501,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
- ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+ grp = ext4_get_group_number(sb, grp_blk);
/*
* possible race:
@@ -3412,6 +3535,7 @@ static noinline_for_stack int
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_prealloc_space *pa;
struct ext4_group_info *grp;
struct ext4_inode_info *ei;
@@ -3443,16 +3567,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
/* also, we should cover whole original request */
- wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
/* the smallest one defines real window */
win = min(winl, wins);
- offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+ offs = ac->ac_o_ex.fe_logical %
+ EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (offs && offs < win)
win = offs;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+ EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -3477,7 +3603,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
- atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+ atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3592,7 +3718,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- grp_blk_start = pa->pa_pstart - bit;
+ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;
@@ -3607,7 +3733,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
free += next - bit;
trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
- trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
+ trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+ EXT4_C2B(sbi, bit)),
next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
@@ -3638,7 +3765,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ext4_group_t group;
ext4_grpblk_t bit;
- trace_ext4_mb_release_group_pa(pa);
+ trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3690,7 +3817,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
}
if (needed == 0)
- needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
INIT_LIST_HEAD(&list);
repeat:
@@ -3724,11 +3851,7 @@ repeat:
if (free < needed && busy) {
busy = 0;
ext4_unlock_group(sb, group);
- /*
- * Yield the CPU here so that we don't get soft lockup
- * in non preempt case.
- */
- yield();
+ cond_resched();
goto repeat;
}
@@ -3843,7 +3966,7 @@ repeat:
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
@@ -3879,15 +4002,15 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
struct super_block *sb = ac->ac_sb;
ext4_group_t ngroups, i;
- if (!mb_enable_debug ||
+ if (!ext4_mballoc_debug ||
(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
return;
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:"
+ ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
" Allocation context details:");
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d",
+ ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
ac->ac_status, ac->ac_flags);
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, "
+ ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
"goal %lu/%lu/%lu@%lu, "
"best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
@@ -3903,9 +4026,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found",
- ac->ac_ex_scanned, ac->ac_found);
- ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: ");
+ ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
+ ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
ngroups = ext4_get_groups_count(sb);
for (i = 0; i < ngroups; i++) {
struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -3958,7 +4080,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
@@ -3969,6 +4091,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
return;
}
+ if (sbi->s_mb_group_prealloc <= 0) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
/* don't use group allocation for large files */
size = max(size, isize);
if (size > sbi->s_mb_stream_request) {
@@ -4007,8 +4134,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
len = ar->len;
/* just a dirty hack to filter too big requests */
- if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
- len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+ len = EXT4_CLUSTERS_PER_GROUP(sb);
/* start searching from the goal */
goal = ar->goal;
@@ -4018,19 +4145,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- memset(ac, 0, sizeof(struct ext4_allocation_context));
- ac->ac_b_ex.fe_logical = ar->logical;
+ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
- ac->ac_o_ex.fe_logical = ar->logical;
+ ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
- ac->ac_g_ex.fe_logical = ar->logical;
- ac->ac_g_ex.fe_group = group;
- ac->ac_g_ex.fe_start = block;
- ac->ac_g_ex.fe_len = len;
+ ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
/* we have to define context: we'll we work with a file or
@@ -4104,7 +4227,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, "Error loading buddy information for %u",
group);
@@ -4142,7 +4265,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
- rcu_read_lock();
+ spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
@@ -4166,12 +4289,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
- rcu_read_unlock();
+ spin_unlock(&lg->lg_prealloc_lock);
/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
- order, lg_prealloc_count);
+ order, lg_prealloc_count);
return;
}
return ;
@@ -4182,13 +4305,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&pa->pa_lock);
- pa->pa_pstart += ac->ac_b_ex.fe_len;
- pa->pa_lstart += ac->ac_b_ex.fe_len;
+ pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
@@ -4249,13 +4373,18 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
- unsigned int reserv_blks = 0;
+ unsigned int reserv_clstrs = 0;
+ might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ /* Allow to use superuser reservation for quota file */
+ if (IS_NOQUOTA(ar->inode))
+ ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
/*
* For delayed allocation, we could skip the ENOSPC and
* EDQUOT check, as blocks and quotas have been already
@@ -4269,22 +4398,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
* and verify allocation doesn't exceed the quota limits.
*/
while (ar->len &&
- ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+ ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
/* let others to free the space */
- yield();
+ cond_resched();
ar->len = ar->len >> 1;
}
if (!ar->len) {
*errp = -ENOSPC;
return 0;
}
- reserv_blks = ar->len;
+ reserv_clstrs = ar->len;
if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
- dquot_alloc_block_nofail(ar->inode, ar->len);
+ dquot_alloc_block_nofail(ar->inode,
+ EXT4_C2B(sbi, ar->len));
} else {
while (ar->len &&
- dquot_alloc_block(ar->inode, ar->len)) {
+ dquot_alloc_block(ar->inode,
+ EXT4_C2B(sbi, ar->len))) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
@@ -4297,7 +4428,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
}
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
ar->len = 0;
*errp = -ENOMEM;
@@ -4318,17 +4449,22 @@ repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
if (*errp)
- goto errout;
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
if (*errp == -EAGAIN) {
/*
* drop the reference that we took
@@ -4340,10 +4476,10 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
- } else if (*errp)
- errout:
+ } else if (*errp) {
ext4_discard_allocated_blocks(ac);
- else {
+ goto errout;
+ } else {
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
ar->len = ac->ac_b_ex.fe_len;
}
@@ -4354,6 +4490,7 @@ repeat:
*errp = -ENOSPC;
}
+errout:
if (*errp) {
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
@@ -4364,13 +4501,13 @@ out:
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
- dquot_free_block(ar->inode, inquota - ar->len);
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
if (!ext4_test_inode_state(ar->inode,
EXT4_STATE_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
}
trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4386,9 +4523,9 @@ out:
static int can_merge(struct ext4_free_data *entry1,
struct ext4_free_data *entry2)
{
- if ((entry1->t_tid == entry2->t_tid) &&
- (entry1->group == entry2->group) &&
- ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ if ((entry1->efd_tid == entry2->efd_tid) &&
+ (entry1->efd_group == entry2->efd_group) &&
+ ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
return 1;
return 0;
}
@@ -4398,7 +4535,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_free_data *new_entry)
{
ext4_group_t group = e4b->bd_group;
- ext4_grpblk_t block;
+ ext4_grpblk_t cluster;
struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
@@ -4410,8 +4547,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
- new_node = &new_entry->node;
- block = new_entry->start_blk;
+ new_node = &new_entry->efd_node;
+ cluster = new_entry->efd_start_cluster;
if (!*n) {
/* first free block exent. We need to
@@ -4424,14 +4561,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
while (*n) {
parent = *n;
- entry = rb_entry(parent, struct ext4_free_data, node);
- if (block < entry->start_blk)
+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
+ if (cluster < entry->efd_start_cluster)
n = &(*n)->rb_left;
- else if (block >= (entry->start_blk + entry->count))
+ else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
n = &(*n)->rb_right;
else {
ext4_grp_locked_error(sb, group, 0,
- ext4_group_first_block_no(sb, group) + block,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
return 0;
}
@@ -4443,34 +4581,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
/* Now try to see the extent can be merged to left and right */
node = rb_prev(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
- if (can_merge(entry, new_entry)) {
- new_entry->start_blk = entry->start_blk;
- new_entry->count += entry->count;
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(entry, new_entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
node = rb_next(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
- if (can_merge(new_entry, entry)) {
- new_entry->count += entry->count;
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(new_entry, entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
/* Add the extent to transaction's private list */
- spin_lock(&sbi->s_md_lock);
- list_add(&new_entry->list, &handle->h_transaction->t_private_list);
- spin_unlock(&sbi->s_md_lock);
+ ext4_journal_callback_add(handle, ext4_free_data_callback,
+ &new_entry->efd_jce);
return 0;
}
@@ -4489,16 +4622,18 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_group_desc *gdp;
- unsigned long freed = 0;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_buddy e4b;
+ unsigned int count_clusters;
int err = 0;
int ret;
+ might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
@@ -4524,10 +4659,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(bh && (count > 1));
for (i = 0; i < count; i++) {
+ cond_resched();
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- if (unlikely(!tbh))
+ if (!tbh)
continue;
ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
@@ -4544,18 +4680,56 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!ext4_should_writeback_data(inode))
flags |= EXT4_FREE_BLOCKS_METADATA;
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
+ ext4_get_group_info(sb, block_group))))
+ return;
+
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ overflow = EXT4_C2B(sbi, bit) + count -
+ EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
+ count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh) {
err = -EIO;
@@ -4570,9 +4744,9 @@ do_more:
if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
+ EXT4_SB(sb)->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
+ EXT4_SB(sb)->s_itb_per_group)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
@@ -4597,11 +4771,11 @@ do_more:
#ifdef AGGRESSIVE_CHECK
{
int i;
- for (i = 0; i < count; i++)
+ for (i = 0; i < count_clusters; i++)
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
err = ext4_mb_load_buddy(sb, block_group, &e4b);
if (err)
@@ -4613,43 +4787,74 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ retry:
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
if (!new_entry) {
- err = -ENOMEM;
- goto error_return;
+ /*
+ * We use a retry loop because
+ * ext4_free_blocks() is not allowed to fail.
+ */
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
}
- new_entry->start_blk = bit;
- new_entry->group = block_group;
- new_entry->count = count;
- new_entry->t_tid = handle->h_transaction->t_tid;
+ new_entry->efd_start_cluster = bit;
+ new_entry->efd_group = block_group;
+ new_entry->efd_count = count_clusters;
+ new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
/* need to update group_info->bb_free and bitmap
* with group lock held. generate_buddy look at
* them with group lock_held
*/
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, block_group, bit, count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%lu failed"
+ " with %d", block_group, bit, count,
+ err);
+ } else
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
- mb_free_blocks(inode, &e4b, bit, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_blks_count(sb, gdp) + count;
- ext4_free_blks_set(sb, gdp, ret);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+ ext4_free_group_clusters_set(sb, gdp, ret);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_add(count_clusters,
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
- ext4_mb_unload_buddy(&e4b);
+ if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter,
+ count_clusters);
+ spin_lock(&ei->i_block_reservation_lock);
+ if (flags & EXT4_FREE_BLOCKS_METADATA)
+ ei->i_reserved_meta_blocks += count_clusters;
+ else
+ ei->i_reserved_data_blocks += count_clusters;
+ spin_unlock(&ei->i_block_reservation_lock);
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_reclaim_block(inode,
+ EXT4_C2B(sbi, count_clusters));
+ } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
- freed += count;
+ ext4_mb_unload_buddy(&e4b);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4667,10 +4872,7 @@ do_more:
put_bh(bitmap_bh);
goto do_more;
}
- ext4_mark_super_dirty(sb);
error_return:
- if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
- dquot_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
return;
@@ -4680,7 +4882,7 @@ error_return:
* ext4_group_add_blocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
- * @block: start physcial block to add to the block group
+ * @block: start physical block to add to the block group
* @count: number of blocks to free
*
* This marks the blocks as free in the bitmap and buddy.
@@ -4778,16 +4980,18 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count);
mb_free_blocks(NULL, &e4b, bit, count);
- blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
- ext4_free_blks_set(sb, desc, blk_free_count);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+ blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, blk_free_count);
+ ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_NUM_B2C(sbi, blocks_freed));
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(blocks_freed,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
ext4_mb_unload_buddy(&e4b);
@@ -4820,10 +5024,13 @@ error_return:
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static void ext4_trim_extent(struct super_block *sb, int start, int count,
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
{
struct ext4_free_extent ex;
+ int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -4839,9 +5046,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ext4_issue_discard(sb, group, start, count);
+ ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
+ return ret;
}
/**
@@ -4870,7 +5078,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
void *bitmap;
ext4_grpblk_t next, count = 0, free_count = 0;
struct ext4_buddy e4b;
- int ret;
+ int ret = 0;
trace_ext4_trim_all_free(sb, group, start, max);
@@ -4890,15 +5098,18 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
start = (e4b.bd_info->bb_first_free > start) ?
e4b.bd_info->bb_first_free : start;
- while (start < max) {
- start = mb_find_next_zero_bit(bitmap, max, start);
- if (start >= max)
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
break;
- next = mb_find_next_bit(bitmap, max, start);
+ next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
- ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ ret = ext4_trim_extent(sb, start,
+ next - start, group, &e4b);
+ if (ret && ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
count += next - start;
}
free_count += next - start;
@@ -4919,8 +5130,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
break;
}
- if (!ret)
+ if (!ret) {
+ ret = count;
EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ }
out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -4928,7 +5141,7 @@ out:
ext4_debug("trimmed %d blocks in the group %d\n",
count, group);
- return count;
+ return ret;
}
/**
@@ -4946,37 +5159,38 @@ out:
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
struct ext4_group_info *grp;
- ext4_group_t first_group, last_group;
- ext4_group_t group, ngroups = ext4_get_groups_count(sb);
- ext4_grpblk_t cnt = 0, first_block, last_block;
- uint64_t start, len, minlen, trimmed = 0;
+ ext4_group_t group, first_group, last_group;
+ ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
+ uint64_t start, end, minlen, trimmed = 0;
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
- len = range->len >> sb->s_blocksize_bits;
- minlen = range->minlen >> sb->s_blocksize_bits;
+ end = start + (range->len >> sb->s_blocksize_bits) - 1;
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ range->minlen >> sb->s_blocksize_bits);
- if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
return -EINVAL;
- if (start + len <= first_data_blk)
+ if (end >= max_blks)
+ end = max_blks - 1;
+ if (end <= first_data_blk)
goto out;
- if (start < first_data_blk) {
- len -= first_data_blk - start;
+ if (start < first_data_blk)
start = first_data_blk;
- }
- /* Determine first and last group to examine based on start and len */
+ /* Determine first and last group to examine based on start and end */
ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
- &first_group, &first_block);
- ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
- &last_group, &last_block);
- last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
- last_block = EXT4_BLOCKS_PER_GROUP(sb);
+ &first_group, &first_cluster);
+ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
+ &last_group, &last_cluster);
- if (first_group > last_group)
- return -EINVAL;
+ /* end now represents the last cluster to discard in this group */
+ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
grp = ext4_get_group_info(sb, group);
@@ -4988,31 +5202,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
/*
- * For all the groups except the last one, last block will
- * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
- * change it for the last group in which case start +
- * len < EXT4_BLOCKS_PER_GROUP(sb).
+ * For all the groups except the last one, last cluster will
+ * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group. Note that last_cluster was
+ * already computed earlier by ext4_get_group_no_and_offset().
*/
- if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
- last_block = first_block + len;
- len -= last_block - first_block;
+ if (group == last_group)
+ end = last_cluster;
if (grp->bb_free >= minlen) {
- cnt = ext4_trim_all_free(sb, group, first_block,
- last_block, minlen);
+ cnt = ext4_trim_all_free(sb, group, first_cluster,
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
}
+ trimmed += cnt;
}
- trimmed += cnt;
- first_block = 0;
+
+ /*
+ * For every group except the first one, we are sure
+ * that the first cluster to discard will be cluster #0.
+ */
+ first_cluster = 0;
}
- range->len = trimmed * sb->s_blocksize;
if (!ret)
atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
out:
+ range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}