diff options
47 files changed, 3986 insertions, 2222 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 new file mode 100644 index 00000000000..4e79074de28 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -0,0 +1,81 @@ +What: /sys/fs/ext4/<disk>/mb_stats +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Controls whether the multiblock allocator should + collect statistics, which are shown during the unmount. + 1 means to collect statistics, 0 means not to collect + statistics + +What: /sys/fs/ext4/<disk>/mb_group_prealloc +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + The multiblock allocator will round up allocation + requests to a multiple of this tuning parameter if the + stripe size is not set in the ext4 superblock + +What: /sys/fs/ext4/<disk>/mb_max_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + The maximum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4/<disk>/mb_min_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + The minimum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4/<disk>/mb_order2_req +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Tuning parameter which controls the minimum size for + requests (as a power of 2) where the buddy cache is + used + +What: /sys/fs/ext4/<disk>/mb_stream_req +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Files which have fewer blocks than this tunable + parameter will have their blocks allocated out of a + block group specific preallocation pool, so that small + files are packed closely together. Each large file + will have its blocks allocated out of its own unique + preallocation pool. + +What: /sys/fs/ext4/<disk>/inode_readahead +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Tuning parameter which controls the maximum number of + inode table blocks that ext4's inode table readahead + algorithm will pre-read into the buffer cache + +What: /sys/fs/ext4/<disk>/delayed_allocation_blocks +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + This file is read-only and shows the number of blocks + that are dirty in the page cache, but which do not + have their location in the filesystem allocated yet. + +What: /sys/fs/ext4/<disk>/lifetime_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + This file is read-only and shows the number of kilobytes + of data that have been written to this filesystem since it was + created. + +What: /sys/fs/ext4/<disk>/session_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + This file is read-only and shows the number of + kilobytes of data that have been written to this + filesystem since it was mounted. diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index cec829bc729..97882df0486 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be * extent format more robust in face of on-disk corruption due to magics, * internal redundancy in tree * improved file allocation (multi-block alloc) -* fix 32000 subdirectory limit +* lift 32000 subdirectory limit imposed by i_links_count[1] * nsec timestamps for mtime, atime, ctime, create time * inode version field on disk (NFSv4, Lustre) * reduced e2fsck time via uninit_bg feature @@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force the ordering) +[1] Filesystems with a block size of 1k may see a limit imposed by the +directory hash tree having a maximum depth of two. + 2.2 Candidate features for future inclusion * Online defrag (patches available but not well tested) @@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata performance. barrier=<0|1(*)> This enables/disables the use of write barriers in - the jbd code. barrier=0 disables, barrier=1 enables. - This also requires an IO stack which can support +barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. +nobarrier This also requires an IO stack which can support barriers, and if jbd gets an error on a barrier write, it will disable again with a warning. Write barriers enforce proper on-disk ordering @@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in safe to use, at some performance penalty. If your disks are battery-backed in one way or another, disabling barriers may safely improve performance. + The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for + consistency with other ext4 mount options. inode_readahead=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode @@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the a slightly higher priority than the default I/O priority. +auto_da_alloc(*) Many broken applications don't use fsync() when +noauto_da_alloc replacing existing files via patterns such as + fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, + fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). + If auto_da_alloc is enabled, ext4 will detect + the replace-via-rename and replace-via-truncate + patterns and force that any delayed allocation + blocks are allocated such that at the next + journal commit, in the default data=ordered + mode, the data blocks of the new file are forced + to disk before the rename() operation is + commited. This provides roughly the same level + of guarantees as ext3, and avoids the + "zero-length" problem that can happen when a + system crashes before the delayed allocation + blocks are forced to disk. + Data Mode ========= There are 3 different data modes: diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 830bad7cce0..efc4fd9f40c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname> File Content mb_groups details of multiblock allocator buddy cache of free blocks mb_history multiblock allocation history - stats controls whether the multiblock allocator should start - collecting statistics, which are shown during the unmount - group_prealloc the multiblock allocator will round up allocation - requests to a multiple of this tuning parameter if the - stripe size is not set in the ext4 superblock - max_to_scan The maximum number of extents the multiblock allocator - will search to find the best extent - min_to_scan The minimum number of extents the multiblock allocator - will search to find the best extent - order2_req Tuning parameter which controls the minimum size for - requests (as a power of 2) where the buddy cache is - used - stream_req Files which have fewer blocks than this tunable - parameter will have their blocks allocated out of a - block group specific preallocation pool, so that small - files are packed closely together. Each large file - will have its blocks allocated out of its own unique - preallocation pool. -inode_readahead Tuning parameter which controls the maximum number of - inode table blocks that ext4's inode table readahead - algorithm will pre-read into the buffer cache .............................................................................. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d2cf5a54a4b..9adf5e4f7e9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o + compression.o delayed-ref.o else # Normal Makefile diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 72677ce2b74..b30986f00b9 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,6 +66,12 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* + * list for tracking inodes that must be sent to disk before a + * rename or truncate commit + */ + struct list_head ordered_operations; + /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -86,12 +92,6 @@ struct btrfs_inode { */ u64 logged_trans; - /* - * trans that last made a change that should be fully fsync'd. This - * gets reset to zero each time the inode is logged - */ - u64 log_dirty_trans; - /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file */ @@ -121,6 +121,25 @@ struct btrfs_inode { /* the start of block group preferred for allocations. */ u64 block_group; + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + * + * yes, its silly to have a single bitflag, but we might grow more + * of these. + */ + unsigned ordered_data_close:1; + struct inode vfs_inode; }; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 37f31b5529a..dbb72412463 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. - * - * prealloc_dest -- if you have already reserved a destination for the cow, - * this uses that block instead of allocating a new one. - * btrfs_alloc_reserved_extent is used to finish the allocation. */ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - u64 prealloc_dest) + u64 search_start, u64 empty_size) { u64 parent_start; struct extent_buffer *cow; @@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); - if (prealloc_dest) { - struct btrfs_key ins; - - ins.objectid = prealloc_dest; - ins.offset = buf->len; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ret = btrfs_alloc_reserved_extent(trans, root, parent_start, - root->root_key.objectid, - trans->transid, level, &ins); - BUG_ON(ret); - cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len, level); - } else { - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, - root->root_key.objectid, - trans->transid, level, - search_start, empty_size); - } + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, root->root_key.objectid, + trans->transid, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, u64 prealloc_dest) + struct extent_buffer **cow_ret) { u64 search_start; int ret; @@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; - WARN_ON(prealloc_dest); return 0; } @@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, - prealloc_dest); + parent_slot, cow_ret, search_start, 0); return ret; } @@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize), 0); + (end_slot - i) * blocksize)); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); BUG_ON(ret); spin_lock(&root->node_lock); @@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, child->start, + child->len, mid->start, child->start, root->root_key.objectid, trans->transid, level - 1); @@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; + if (trans->transaction->delayed_refs.flushing && + btrfs_header_nritems(mid) > 2) + return 0; + if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left, 0); + parent, pslot - 1, &left); if (wret) { ret = wret; goto enospc; @@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right, 0); + parent, pslot + 1, &right); if (wret) { ret = wret; goto enospc; @@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left, 0); + pslot - 1, &left); if (ret) wret = 1; else { @@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right, 0); + &right); if (ret) wret = 1; else { @@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root u8 lowest_level = 0; u64 blocknr; u64 gen; - struct btrfs_key prealloc_block; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (ins_len < 0) lowest_unlock = 2; - prealloc_block.objectid = 0; - again: if (p->skip_locking) b = btrfs_root_node(root); @@ -1529,44 +1508,11 @@ again: !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { goto cow_done; } - - /* ok, we have to cow, is our old prealloc the right - * size? - */ - if (prealloc_block.objectid && - prealloc_block.offset != b->len) { - btrfs_release_path(root, p); - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - prealloc_block.objectid = 0; - goto again; - } - - /* - * for higher level blocks, try not to allocate blocks - * with the block and the parent locks held. - */ - if (level > 0 && !prealloc_block.objectid) { - u32 size = b->len; - u64 hint = b->start; - - btrfs_release_path(root, p); - ret = btrfs_reserve_extent(trans, root, - size, size, 0, - hint, (u64)-1, - &prealloc_block, 0); - BUG_ON(ret); - goto again; - } - btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], - &b, prealloc_block.objectid); - prealloc_block.objectid = 0; + p->slots[level + 1], &b); if (wret) { free_extent_buffer(b); ret = wret; @@ -1742,12 +1688,8 @@ done: * we don't really know what they plan on doing with the path * from here on, so for now just mark it as blocking */ - btrfs_set_path_blocking(p); - if (prealloc_block.objectid) { - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - } + if (!p->leave_spinning) + btrfs_set_path_blocking(p); return ret; } @@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, int ret; eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); BUG_ON(ret); btrfs_set_lock_blocking(eb); @@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, } ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb, 0); + &eb); BUG_ON(ret); if (root->root_key.objectid == @@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->start, c->start, + lower->len, lower->start, c->start, root->root_key.objectid, trans->transid, level - 1); BUG_ON(ret); @@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else { + } else if (!trans->transaction->delayed_refs.flushing) { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } -/* - * push some data in the path leaf to the right, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * returns 1 if the push failed because the other node didn't have enough - * room, 0 if everything worked out and < 0 if there were major errors. - */ -static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems) { struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *right; - struct extent_buffer *upper; + struct extent_buffer *upper = path->nodes[1]; struct btrfs_disk_key disk_key; int slot; u32 i; - int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; - u32 left_nritems; u32 nr; u32 right_nritems; u32 data_end; u32 this_item_size; int ret; - slot = path->slots[1]; - if (!path->nodes[1]) - return 1; - - upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(upper) - 1) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - right = read_node_slot(root, upper, slot + 1); - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right, 0); - if (ret) - goto out_unlock; - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) - goto out_unlock; - if (empty) nr = 0; else @@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (path->slots[0] >= left_nritems) push_space += data_size; + slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { item = btrfs_item_nr(left, i); @@ -2528,24 +2432,82 @@ out_unlock: } /* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + return __push_leaf_right(trans, root, path, data_size, empty, + right, free_space, left_nritems); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* * push some |