aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext481
-rw-r--r--Documentation/filesystems/ext4.txt30
-rw-r--r--Documentation/filesystems/proc.txt21
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/btrfs_inode.h31
-rw-r--r--fs/btrfs/ctree.c588
-rw-r--r--fs/btrfs/ctree.h69
-rw-r--r--fs/btrfs/delayed-ref.c669
-rw-r--r--fs/btrfs/delayed-ref.h193
-rw-r--r--fs/btrfs/dir-item.c3
-rw-r--r--fs/btrfs/disk-io.c81
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c1674
-rw-r--r--fs/btrfs/extent_io.c51
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/file-item.c7
-rw-r--r--fs/btrfs/file.c50
-rw-r--r--fs/btrfs/inode-item.c3
-rw-r--r--fs/btrfs/inode.c194
-rw-r--r--fs/btrfs/locking.c21
-rw-r--r--fs/btrfs/ordered-data.c118
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/transaction.c151
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c444
-rw-r--r--fs/btrfs/tree-log.h17
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h6
-rw-r--r--fs/ext4/ext4_sb.h14
-rw-r--r--fs/ext4/extents.c127
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/ialloc.c273
-rw-r--r--fs/ext4/inode.c424
-rw-r--r--fs/ext4/ioctl.c17
-rw-r--r--fs/ext4/mballoc.c158
-rw-r--r--fs/ext4/mballoc.h8
-rw-r--r--fs/ext4/namei.c164
-rw-r--r--fs/ext4/resize.c8
-rw-r--r--fs/ext4/super.c327
-rw-r--r--fs/jbd2/commit.c5
-rw-r--r--fs/jbd2/revoke.c24
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--include/linux/jbd2.h6
47 files changed, 3986 insertions, 2222 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 00000000000..4e79074de28
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
+What: /sys/fs/ext4/<disk>/mb_stats
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Controls whether the multiblock allocator should
+ collect statistics, which are shown during the unmount.
+ 1 means to collect statistics, 0 means not to collect
+ statistics
+
+What: /sys/fs/ext4/<disk>/mb_group_prealloc
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The multiblock allocator will round up allocation
+ requests to a multiple of this tuning parameter if the
+ stripe size is not set in the ext4 superblock
+
+What: /sys/fs/ext4/<disk>/mb_max_to_scan
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The maximum number of extents the multiblock allocator
+ will search to find the best extent
+
+What: /sys/fs/ext4/<disk>/mb_min_to_scan
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The minimum number of extents the multiblock allocator
+ will search to find the best extent
+
+What: /sys/fs/ext4/<disk>/mb_order2_req
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Tuning parameter which controls the minimum size for
+ requests (as a power of 2) where the buddy cache is
+ used
+
+What: /sys/fs/ext4/<disk>/mb_stream_req
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Files which have fewer blocks than this tunable
+ parameter will have their blocks allocated out of a
+ block group specific preallocation pool, so that small
+ files are packed closely together. Each large file
+ will have its blocks allocated out of its own unique
+ preallocation pool.
+
+What: /sys/fs/ext4/<disk>/inode_readahead
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Tuning parameter which controls the maximum number of
+ inode table blocks that ext4's inode table readahead
+ algorithm will pre-read into the buffer cache
+
+What: /sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of blocks
+ that are dirty in the page cache, but which do not
+ have their location in the filesystem allocated yet.
+
+What: /sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of kilobytes
+ of data that have been written to this filesystem since it was
+ created.
+
+What: /sys/fs/ext4/<disk>/session_write_kbytes
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of
+ kilobytes of data that have been written to this
+ filesystem since it was mounted.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc729..97882df0486 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
* extent format more robust in face of on-disk corruption due to magics,
* internal redundancy in tree
* improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
* nsec timestamps for mtime, atime, ctime, create time
* inode version field on disk (NFSv4, Lustre)
* reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
the ordering)
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
2.2 Candidate features for future inclusion
* Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata
performance.
barrier=<0|1(*)> This enables/disables the use of write barriers in
- the jbd code. barrier=0 disables, barrier=1 enables.
- This also requires an IO stack which can support
+barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
+nobarrier This also requires an IO stack which can support
barriers, and if jbd gets an error on a barrier
write, it will disable again with a warning.
Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
safe to use, at some performance penalty. If
your disks are battery-backed in one way or another,
disabling barriers may safely improve performance.
+ The mount options "barrier" and "nobarrier" can
+ also be used to enable or disable barriers, for
+ consistency with other ext4 mount options.
inode_readahead=n This tuning parameter controls the maximum
number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
a slightly higher priority than the default I/O
priority.
+auto_da_alloc(*) Many broken applications don't use fsync() when
+noauto_da_alloc replacing existing files via patterns such as
+ fd = open("foo.new")/write(fd,..)/close(fd)/
+ rename("foo.new", "foo"), or worse yet,
+ fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+ If auto_da_alloc is enabled, ext4 will detect
+ the replace-via-rename and replace-via-truncate
+ patterns and force any delayed allocation
+ blocks to be allocated, so that at the next
+ journal commit, in the default data=ordered
+ mode, the data blocks of the new file are forced
+ to disk before the rename() operation is
+ committed. This provides roughly the same level
+ of guarantees as ext3, and avoids the
+ "zero-length" problem that can happen when a
+ system crashes before the delayed allocation
+ blocks are forced to disk.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 830bad7cce0..efc4fd9f40c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
File Content
mb_groups details of multiblock allocator buddy cache of free blocks
mb_history multiblock allocation history
- stats controls whether the multiblock allocator should start
- collecting statistics, which are shown during the unmount
- group_prealloc the multiblock allocator will round up allocation
- requests to a multiple of this tuning parameter if the
- stripe size is not set in the ext4 superblock
- max_to_scan The maximum number of extents the multiblock allocator
- will search to find the best extent
- min_to_scan The minimum number of extents the multiblock allocator
- will search to find the best extent
- order2_req Tuning parameter which controls the minimum size for
- requests (as a power of 2) where the buddy cache is
- used
- stream_req Files which have fewer blocks than this tunable
- parameter will have their blocks allocated out of a
- block group specific preallocation pool, so that small
- files are packed closely together. Each large file
- will have its blocks allocated out of its own unique
- preallocation pool.
-inode_readahead Tuning parameter which controls the maximum number of
- inode table blocks that ext4's inode table readahead
- algorithm will pre-read into the buffer cache
..............................................................................
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b..9adf5e4f7e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
- compression.o
+ compression.o delayed-ref.o
else
# Normal Makefile
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74..b30986f00b9 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
+ /*
+ * list for tracking inodes that must be sent to disk before a
+ * rename or truncate commit
+ */
+ struct list_head ordered_operations;
+
/* the space_info for where this inode's data allocations are done */
struct btrfs_space_info *space_info;
@@ -86,12 +92,6 @@ struct btrfs_inode {
*/
u64 logged_trans;
- /*
- * trans that last made a change that should be fully fsync'd. This
- * gets reset to zero each time the inode is logged
- */
- u64 log_dirty_trans;
-
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
*/
@@ -121,6 +121,25 @@ struct btrfs_inode {
/* the start of block group preferred for allocations. */
u64 block_group;
+ /* the fsync log has some corner cases that mean we have to check
+ * directories to see if any unlinks have been done before
+ * the directory was logged. See tree-log.c for all the
+ * details
+ */
+ u64 last_unlink_trans;
+
+ /*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ *
+ * yes, it's silly to have a single bitflag, but we might grow more
+ * of these.
+ */
+ unsigned ordered_data_close:1;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529a..dbb72412463 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
* empty_size -- a hint that you plan on doing more cow. This is the size in
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
*/
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- u64 prealloc_dest)
+ u64 search_start, u64 empty_size)
{
u64 parent_start;
struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
level = btrfs_header_level(buf);
nritems = btrfs_header_nritems(buf);
- if (prealloc_dest) {
- struct btrfs_key ins;
-
- ins.objectid = prealloc_dest;
- ins.offset = buf->len;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
- ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
- root->root_key.objectid,
- trans->transid, level, &ins);
- BUG_ON(ret);
- cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
- buf->len, level);
- } else {
- cow = btrfs_alloc_free_block(trans, root, buf->len,
- parent_start,
- root->root_key.objectid,
- trans->transid, level,
- search_start, empty_size);
- }
+ cow = btrfs_alloc_free_block(trans, root, buf->len,
+ parent_start, root->root_key.objectid,
+ trans->transid, level,
+ search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret, u64 prealloc_dest)
+ struct extent_buffer **cow_ret)
{
u64 search_start;
int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_header_owner(buf) == root->root_key.objectid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
*cow_ret = buf;
- WARN_ON(prealloc_dest);
return 0;
}
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0,
- prealloc_dest);
+ parent_slot, cow_ret, search_start, 0);
return ret;
}
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize), 0);
+ (end_slot - i) * blocksize));
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(!child);
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
BUG_ON(ret);
spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, child->start,
+ child->len,
mid->start, child->start,
root->root_key.objectid,
trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
+ if (trans->transaction->delayed_refs.flushing &&
+ btrfs_header_nritems(mid) > 2)
+ return 0;
+
if (btrfs_header_nritems(mid) < 2)
err_on_enospc = 1;
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left, 0);
+ parent, pslot - 1, &left);
if (wret) {
ret = wret;
goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right, 0);
+ parent, pslot + 1, &right);
if (wret) {
ret = wret;
goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left, 0);
+ pslot - 1, &left);
if (ret)
wret = 1;
else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right, 0);
+ &right);
if (ret)
wret = 1;
else {
@@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
u8 lowest_level = 0;
u64 blocknr;
u64 gen;
- struct btrfs_key prealloc_block;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (ins_len < 0)
lowest_unlock = 2;
- prealloc_block.objectid = 0;
-
again:
if (p->skip_locking)
b = btrfs_root_node(root);
@@ -1529,44 +1508,11 @@ again:
!btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
goto cow_done;
}
-
- /* ok, we have to cow, is our old prealloc the right
- * size?
- */
- if (prealloc_block.objectid &&
- prealloc_block.offset != b->len) {
- btrfs_release_path(root, p);
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- prealloc_block.objectid = 0;
- goto again;
- }
-
- /*
- * for higher level blocks, try not to allocate blocks
- * with the block and the parent locks held.
- */
- if (level > 0 && !prealloc_block.objectid) {
- u32 size = b->len;
- u64 hint = b->start;
-
- btrfs_release_path(root, p);
- ret = btrfs_reserve_extent(trans, root,
- size, size, 0,
- hint, (u64)-1,
- &prealloc_block, 0);
- BUG_ON(ret);
- goto again;
- }
-
btrfs_set_path_blocking(p);
wret = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1],
- &b, prealloc_block.objectid);
- prealloc_block.objectid = 0;
+ p->slots[level + 1], &b);
if (wret) {
free_extent_buffer(b);
ret = wret;
@@ -1742,12 +1688,8 @@ done:
* we don't really know what they plan on doing with the path
* from here on, so for now just mark it as blocking
*/
- btrfs_set_path_blocking(p);
- if (prealloc_block.objectid) {
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- }
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
return ret;
}
@@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
int ret;
eb = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+ ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
BUG_ON(ret);
btrfs_set_lock_blocking(eb);
@@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
}
ret = btrfs_cow_block(trans, root, eb, parent, slot,
- &eb, 0);
+ &eb);
BUG_ON(ret);
if (root->root_key.objectid ==
@@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, lower->start,
- lower->start, c->start,
+ lower->len, lower->start, c->start,
root->root_key.objectid,
trans->transid, level - 1);
BUG_ON(ret);
@@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
- } else {
+ } else if (!trans->transaction->delayed_refs.flushing) {
ret = push_nodes_for_insert(trans, root, path, level);
c = path->nodes[level];
if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
return ret;
}
-/*
- * push some data in the path leaf to the right, trying to free up at
- * least data_size bytes. returns zero if the push worked, nonzero otherwise
- *
- * returns 1 if the push failed because the other node didn't have enough
- * room, 0 if everything worked out and < 0 if there were major errors.
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size, int empty,
+ struct extent_buffer *right,
+ int free_space, u32 left_nritems)
{
struct extent_buffer *left = path->nodes[0];
- struct extent_buffer *right;
- struct extent_buffer *upper;
+ struct extent_buffer *upper = path->nodes[1];
struct btrfs_disk_key disk_key;
int slot;
u32 i;
- int free_space;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
- u32 left_nritems;
u32 nr;
u32 right_nritems;
u32 data_end;
u32 this_item_size;
int ret;
- slot = path->slots[1];
- if (!path->nodes[1])
- return 1;
-
- upper = path->nodes[1];
- if (slot >= btrfs_header_nritems(upper) - 1)
- return 1;
-
- btrfs_assert_tree_locked(path->nodes[1]);
-
- right = read_node_slot(root, upper, slot + 1);
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- /* cow and double check */
- ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right, 0);
- if (ret)
- goto out_unlock;
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- left_nritems = btrfs_header_nritems(left);
- if (left_nritems == 0)
- goto out_unlock;
-
if (empty)
nr = 0;
else
@@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (path->slots[0] >= left_nritems)
push_space += data_size;
+ slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
item = btrfs_item_nr(left, i);
@@ -2528,24 +2432,82 @@ out_unlock:
}
/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ int slot;
+ int free_space;
+ u32 left_nritems;
+ int ret;
+
+ if (!path->nodes[1])
+ return 1;
+
+ slot = path->slots[1];
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ right = read_node_slot(root, upper, slot + 1);
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right);
+ if (ret)
+ goto out_unlock;
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ return __push_leaf_right(trans, root, path, data_size, empty,
+ right, free_space, left_nritems);
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
* push some