Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/compression.c        |   2
-rw-r--r--  fs/btrfs/ctree.h              |  13
-rw-r--r--  fs/btrfs/dev-replace.c        |   5
-rw-r--r--  fs/btrfs/disk-io.c            |   5
-rw-r--r--  fs/btrfs/extent-tree.c        | 148
-rw-r--r--  fs/btrfs/extent_io.c          |  39
-rw-r--r--  fs/btrfs/extent_io.h          |   4
-rw-r--r--  fs/btrfs/extent_map.c         |   2
-rw-r--r--  fs/btrfs/extent_map.h         |   1
-rw-r--r--  fs/btrfs/file.c               |  51
-rw-r--r--  fs/btrfs/free-space-cache.c   | 192
-rw-r--r--  fs/btrfs/inode.c              |  88
-rw-r--r--  fs/btrfs/ioctl.c              | 184
-rw-r--r--  fs/btrfs/locking.c            |  80
-rw-r--r--  fs/btrfs/print-tree.c         |   9
-rw-r--r--  fs/btrfs/qgroup.c             |   4
-rw-r--r--  fs/btrfs/raid56.c             |   5
-rw-r--r--  fs/btrfs/reada.c              |   9
-rw-r--r--  fs/btrfs/scrub.c              |  19
-rw-r--r--  fs/btrfs/super.c              |   7
-rw-r--r--  fs/btrfs/sysfs.c              |  32
-rw-r--r--  fs/btrfs/sysfs.h              |   4
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c  |   2
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c |   2
-rw-r--r--  fs/btrfs/transaction.c        |  24
-rw-r--r--  fs/btrfs/volumes.c            |  66
-rw-r--r--  fs/btrfs/volumes.h            |   3
-rw-r--r--  fs/btrfs/zlib.c               |   2
28 files changed, 685 insertions, 317 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 92371c41422..1daea0b4718 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace)
spin_lock(workspace_lock);
if (*num_workspace < num_online_cpus()) {
- list_add_tail(workspace, idle_workspace);
+ list_add(workspace, idle_workspace);
(*num_workspace)++;
spin_unlock(workspace_lock);
goto wake;
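
The change above swaps list_add_tail() for list_add(), so a freed compression workspace is pushed onto the head of the idle list instead of appended to the tail: the list now behaves as a stack and hands back the most recently used, most cache-hot workspace first. A minimal, hypothetical user-space illustration of the ordering difference (array indices stand in for list_head insertion):

#include <stdio.h>

#define N 3

int main(void)
{
	int lifo[N], fifo[N], top = 0, head = 0, tail = 0;

	/* free workspaces 1, 2, 3 in that order */
	for (int ws = 1; ws <= N; ws++) {
		lifo[top++] = ws;	/* list_add(): push at head */
		fifo[tail++] = ws;	/* list_add_tail(): append  */
	}
	printf("LIFO hands out workspace %d (freed last, cache-hot)\n",
	       lifo[--top]);
	printf("FIFO hands out workspace %d (freed first, cold)\n",
	       fifo[head]);
	return 0;
}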
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7e2c1c1ef3..be91397f4e9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1259,11 +1259,19 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;
+ /*
+ * Used only for delayed data space allocation, because only
+ * the data space allocation and the corresponding metadata
+ * update may cross a transaction boundary.
+ */
+ struct rw_semaphore data_rwsem;
+
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
@@ -3316,7 +3324,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data);
+ struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3330,7 +3338,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset, int no_quota);
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+ int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
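
The new data_rwsem pairs with the delalloc_bytes counter above: delalloc allocators hold the read side while they reserve space, and the free-space-cache writer takes the write side so it never writes out a cache file for a block group with an allocation still in flight. A hedged sketch of the intended lifecycle, using the helper names this patch introduces later in extent-tree.c, inode.c and free-space-cache.c:

/*
 * Reader side (a delalloc allocation, may cross transactions):
 *
 *   btrfs_lock_block_group(bg, 1);         down_read(&bg->data_rwsem)
 *   btrfs_update_reserved_bytes(bg, n, RESERVE_ALLOC, 1);
 *                                          bg->delalloc_bytes += n
 *   btrfs_release_block_group(bg, 1);      up_read(&bg->data_rwsem)
 *   ...ordered extent finishes, possibly in a later transaction...
 *   btrfs_release_delalloc_bytes(root, start, n);
 *                                          bg->delalloc_bytes -= n
 *
 * Writer side (__btrfs_write_out_cache() on a data block group):
 *
 *   down_write(&bg->data_rwsem);
 *   if (bg->delalloc_bytes)
 *           skip writing this cache file (mark it BTRFS_DC_WRITTEN);
 *   up_write(&bg->data_rwsem);
 */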
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2af6e66fe78..eea26e1b2fd 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -36,6 +36,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
+#include "sysfs.h"
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
@@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8bb4aa19898..08e65e9cf2a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -369,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
- btrfs_tree_read_unlock_blocking(eb);
+ if (need_lock)
+ btrfs_tree_read_unlock_blocking(eb);
return ret;
}
@@ -2904,7 +2905,9 @@ retry_root_backup:
if (ret)
goto fail_qgroup;
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
printk(KERN_WARNING
"BTRFS: failed to recover relocation\n");
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fafb3e53ecd..813537f362f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -105,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 num_bytes, int reserve,
+ int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
@@ -3260,7 +3261,8 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE)) {
+ !btrfs_test_opt(root, SPACE_CACHE) ||
+ block_group->delalloc_bytes) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -5613,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @reserve: One of the reservation enums
+ * @delalloc:   Whether the blocks are allocated for a delalloc write
*
* This is called by the allocator when it reserves space, or by somebody who is
* freeing space that was never actually used on disk. For example if you
@@ -5631,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* succeeds.
*/
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+ u64 num_bytes, int reserve, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
@@ -5650,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
+
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
}
} else {
if (cache->ro)
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -5669,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
- struct btrfs_space_info *space_info;
down_write(&fs_info->commit_root_sem);
@@ -5692,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->commit_root_sem);
- list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
- percpu_counter_set(&space_info->total_bytes_pinned, 0);
-
update_global_block_rsv(fs_info);
}
@@ -5732,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
@@ -6206,7 +6212,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
@@ -6365,6 +6371,70 @@ enum btrfs_loop_type {
LOOP_NO_EMPTY_SIZE = 3,
};
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ btrfs_get_block_group(cache);
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ int delalloc)
+{
+ struct btrfs_block_group_cache *used_bg;
+ bool locked = false;
+again:
+ spin_lock(&cluster->refill_lock);
+ if (locked) {
+ if (used_bg == cluster->block_group)
+ return used_bg;
+
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
+
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
+ return used_bg;
+
+ btrfs_get_block_group(used_bg);
+
+ if (!delalloc)
+ return used_bg;
+
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
+
+ spin_unlock(&cluster->refill_lock);
+ down_read(&used_bg->data_rwsem);
+ locked = true;
+ goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ up_read(&cache->data_rwsem);
+ btrfs_put_block_group(cache);
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -6379,7 +6449,7 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
- u64 flags)
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6467,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
up_read(&space_info->groups_sem);
} else {
index = get_block_group_index(block_group);
+ btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -6481,7 +6552,7 @@ search:
u64 offset;
int cached;
- btrfs_get_block_group(block_group);
+ btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;
/*
@@ -6529,16 +6600,16 @@ have_block_group:
* the refill lock keeps out other
* people trying to start a new cluster
*/
- spin_lock(&last_ptr->refill_lock);
- used_block_group = last_ptr->block_group;
- if (used_block_group != block_group &&
- (!used_block_group ||
- used_block_group->ro ||
- !block_group_bits(used_block_group, flags)))
+ used_block_group = btrfs_lock_cluster(block_group,
+ last_ptr,
+ delalloc);
+ if (!used_block_group)
goto refill_cluster;
- if (used_block_group != block_group)
- btrfs_get_block_group(used_block_group);
+ if (used_block_group != block_group &&
+ (used_block_group->ro ||
+ !block_group_bits(used_block_group, flags)))
+ goto release_cluster;
offset = btrfs_alloc_from_cluster(used_block_group,
last_ptr,
@@ -6552,16 +6623,15 @@ have_block_group:
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group,
+ delalloc);
block_group = used_block_group;
}
goto checks;
}
WARN_ON(last_ptr->block_group != used_block_group);
- if (used_block_group != block_group)
- btrfs_put_block_group(used_block_group);
-refill_cluster:
+release_cluster:
/* If we are on LOOP_NO_EMPTY_SIZE, we can't
* set up a new clusters, so lets just skip it
* and let the allocator find whatever block
@@ -6578,8 +6648,10 @@ refill_cluster:
* succeeding in the unclustered
* allocation. */
if (loop >= LOOP_NO_EMPTY_SIZE &&
- last_ptr->block_group != block_group) {
+ used_block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(used_block_group,
+ delalloc);
goto unclustered_alloc;
}
@@ -6589,6 +6661,10 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (used_block_group != block_group)
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+refill_cluster:
if (loop >= LOOP_NO_EMPTY_SIZE) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
@@ -6696,7 +6772,7 @@ checks:
BUG_ON(offset > search_start);
ret = btrfs_update_reserved_bytes(block_group, num_bytes,
- alloc_type);
+ alloc_type, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -6708,13 +6784,13 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
}
up_read(&space_info->groups_sem);
@@ -6827,7 +6903,7 @@ again:
int btrfs_reserve_extent(struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data)
+ struct btrfs_key *ins, int is_data, int delalloc)
{
bool final_tried = false;
u64 flags;
@@ -6837,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
- flags);
+ flags, delalloc);
if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
@@ -6862,7 +6938,8 @@ again:
}
static int __btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len, int pin)
+ u64 start, u64 len,
+ int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
@@ -6881,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
pin_down_extent(root, cache, start, len, 1);
else {
btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
btrfs_put_block_group(cache);
@@ -6891,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
}
int btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len)
+ u64 start, u64 len, int delalloc)
{
- return __btrfs_free_reserved_extent(root, start, len, 0);
+ return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
}
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len)
{
- return __btrfs_free_reserved_extent(root, start, len, 1);
+ return __btrfs_free_reserved_extent(root, start, len, 1, 0);
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -7114,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
return -EINVAL;
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
- RESERVE_ALLOC_NO_ACCOUNT);
+ RESERVE_ALLOC_NO_ACCOUNT, 0);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -7256,7 +7333,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize,
- empty_size, hint, &ins, 0);
+ empty_size, hint, &ins, 0, 0);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -8659,6 +8736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
start);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->new_bg_list);
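
A note on btrfs_lock_cluster() above: data_rwsem can sleep, so it may only be try-acquired while the cluster's refill spinlock is held. On contention the helper drops the spinlock, takes the rwsem blocking, and then re-checks that the cluster still points at the same block group. A minimal user-space model of that retry pattern, with pthread primitives standing in for the kernel's spinlock and rw_semaphore (block-group refcounting omitted):

#include <pthread.h>
#include <stddef.h>

struct group {
	pthread_rwlock_t data_rwsem;
};

struct cluster {
	pthread_spinlock_t refill_lock;
	struct group *block_group;
};

/* Returns with refill_lock held and, if non-NULL, a read lock on the
 * group's data_rwsem, mirroring btrfs_lock_cluster() with delalloc set. */
static struct group *lock_cluster(struct cluster *c)
{
	struct group *g = NULL;
	int locked = 0;
again:
	pthread_spin_lock(&c->refill_lock);
	if (locked) {
		if (g == c->block_group)
			return g;	/* still current: both locks held */
		/* raced with a cluster change: drop the stale lock */
		pthread_rwlock_unlock(&g->data_rwsem);
	}
	g = c->block_group;
	if (!g)
		return NULL;		/* caller drops refill_lock */
	if (pthread_rwlock_tryrdlock(&g->data_rwsem) == 0)
		return g;		/* fast path, no sleeping */

	pthread_spin_unlock(&c->refill_lock);	/* never sleep under a spinlock */
	pthread_rwlock_rdlock(&g->data_rwsem);	/* may block here */
	locked = 1;
	goto again;			/* revalidate under the spinlock */
}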
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f25a9092b94..a389820d158 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2354,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
- int ret;
+ int ret = 0;
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -5068,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
}
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char __user *dst = (char __user *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = page_address(page);
+ if (copy_to_user(dst, kaddr + offset, cur)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+
+ return ret;
+}
+
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
unsigned long min_len, char **map,
unsigned long *map_start,
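
read_extent_buffer_to_user() is the __user counterpart of read_extent_buffer(), added for the tree-search ioctl rework below: it copies at most one backing page per iteration and returns -EFAULT as soon as copy_to_user() faults. The chunking arithmetic, modeled as plain user-space C under the assumption of 4 KiB pages:

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096UL

/* Copy len bytes, starting at byte offset start, from an array of
 * page-sized buffers into dst, at most one page per iteration. */
static void copy_paged(char *const *pages, size_t start, size_t len,
		       char *dst)
{
	size_t i = start / PAGE_SIZE;
	size_t offset = start % PAGE_SIZE;

	while (len > 0) {
		size_t cur = len < PAGE_SIZE - offset ?
			     len : PAGE_SIZE - offset;

		memcpy(dst, pages[i] + offset, cur); /* kernel: copy_to_user() */
		dst += cur;
		len -= cur;
		offset = 0;	/* later pages are read from their start */
		i++;
	}
}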
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8b63f2d4651..ccc264e7bde 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -158,7 +158,6 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
- wait_queue_head_t lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
@@ -304,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+ unsigned long start,
+ unsigned long len);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1874aee69c8..225302b39af 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -75,6 +75,8 @@ void free_extent_map(struct extent_map *em)
if (atomic_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
kmem_cache_free(extent_map_cache, em);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e7fd8a56a14..b2991fd8583 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -15,6 +15,7 @@
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
struct extent_map {
struct rb_node rb_node;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e472441feb5..1f2b99cb55e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -448,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
write_bytes -= copied;
total_copied += copied;
- /* Return to btrfs_file_aio_write to fault page */
+ /* Return to btrfs_file_write_iter to fault page */
if (unlikely(copied == 0))
break;
@@ -1675,27 +1675,22 @@ again:
}
static ssize_t __btrfs_direct_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos,
- size_t count, size_t ocount)
+ struct iov_iter *from,
+ loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct iov_iter i;
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
- count, ocount);
+ written = generic_file_direct_write(iocb, from, pos);
- if (written < 0 || written == count)
+ if (written < 0 || !iov_iter_count(from))
return written;
pos += written;
- count -= written;
- iov_iter_init(&i, iov, nr_segs, count, written);
- written_buffered = __btrfs_buffered_write(file, &i, pos);
+ written_buffered = __btrfs_buffered_write(file, from, pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1730,9 +1725,8 @@ static void update_time_for_write(struct inode *inode)
inode_inc_iversion(inode);
}
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -1741,18 +1735,12 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
u64 end_pos;
ssize_t num_written = 0;
ssize_t err = 0;
- size_t count, ocount;
+ size_t count = iov_iter_count(from);
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ loff_t pos = iocb->ki_pos;
mutex_lock(&inode->i_mutex);
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
- count = ocount;
-
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err) {
@@ -1765,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
+ iov_iter_truncate(from, count);
+
err = file_remove_suid(file);
if (err) {
mutex_unlock(&inode->i_mutex);
@@ -1806,14 +1796,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (unlikely(file->f_flags & O_DIRECT)) {
- num_written = __btrfs_direct_write(iocb, iov, nr_segs,
- pos, count, ocount);
+ num_written = __btrfs_direct_write(iocb, from, pos);
} else {
- struct iov_iter i;
-
- iov_iter_init(&i, iov, nr_segs, count, num_written);
-
- num_written = __btrfs_buffered_write(file, &i, pos);
+ num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
iocb->ki_pos = pos + num_written;
}
@@ -2740,11 +2725,11 @@ out:
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
- .aio_write = btrfs_file_aio_write,
+ .write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 372b05ff194..2b0a627cb5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -274,18 +274,32 @@ struct io_ctl {
};
static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
- struct btrfs_root *root)
+ struct btrfs_root *root, int write)
{
+ int num_pages;
+ int check_crcs = 0;
+
+ num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ check_crcs = 1;
+
+ /* Make sure we can fit our crcs into the first page */
+ if (write && check_crcs &&
+ (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ return -ENOSPC;
+
memset(io_ctl, 0, sizeof(struct io_ctl));
- io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
- GFP_NOFS);
+
+ io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
io_ctl->root = root;
- if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
- io_ctl->check_crcs = 1;
+ io_ctl->check_crcs = check_crcs;
+
return 0;
}
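
The new write-time check in io_ctl_init() hoists the old "crcs must fit in the first page" test out of __btrfs_write_out_cache(): one u32 checksum is kept per page, and all of them live in page zero. A quick check of what that caps a writable cache file at, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL

int main(void)
{
	/* the write fails once num_pages * sizeof(u32) >= PAGE_CACHE_SIZE */
	unsigned long max_pages = PAGE_CACHE_SIZE / sizeof(unsigned int) - 1;

	printf("max cache pages: %lu (file size just under %lu MiB)\n",
	       max_pages, (max_pages + 1) * PAGE_CACHE_SIZE >> 20);
	return 0;
}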
@@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
generation = btrfs_free_space_generation(leaf, header);
btrfs_release_path(path);
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(root->fs_info,
+ "The free space cache file (%llu) is invalid. skip it\n",
+ offset);
+ return 0;
+ }
+
if (BTRFS_I(inode)->generation != generation) {
btrfs_err(root->fs_info,
"free space inode generation (%llu) "
@@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- ret = io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root, 0);
if (ret)
return ret;
@@ -957,19 +978,18 @@ fail:
}
static noinline_for_stack int
-add_ioctl_entries(struct btrfs_root *root,
- struct inode *inode,
- struct btrfs_block_group_cache *block_group,
- struct io_ctl *io_ctl,
- struct extent_state **cached_state,
- struct list_head *bitmap_list,
- int *entries)
+write_pinned_extent_entries(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct io_ctl *io_ctl,
+ int *entries)
{
u64 start, extent_start, extent_end, len;
- struct list_head *pos, *n;
struct extent_io_tree *unpin = NULL;
int ret;
+ if (!block_group)
+ return 0;
+
/*
* We want to add any pinned extents to our free space cache
* so we don't leak the space
@@ -979,23 +999,19 @@ add_ioctl_entries(struct btrfs_root *root,
*/
unpin = root->fs_info->pinned_extents;
- if (block_group)
- start = block_group->key.objectid;
+ start = block_group->key.objectid;
- while (block_group && (start < block_group->key.objectid +
- block_group->key.offset)) {
+ while (start < block_group->key.objectid + block_group->key.offset) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
- if (ret) {
- ret = 0;
- break;
- }
+ if (ret)
+ return 0;
/* This pinned extent is out of our range */
if (extent_start >= block_group->key.objectid +
block_group->key.offset)
- break;
+ return 0;
extent_start = max(extent_start, start);
extent_end = min(block_group->key.objectid +
@@ -1005,11 +1021,20 @@ add_ioctl_entries(struct btrfs_root *root,
*entries += 1;
ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
start = extent_end;
}
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+ int ret;
+
/* Write out the bitmaps */
list_for_each_safe(pos, n, bitmap_list) {
struct btrfs_free_space *entry =
@@ -1017,36 +1042,24 @@ add_ioctl_entries(struct btrfs_root *root,
ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
list_del_init(&entry->list);
}
- /* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(io_ctl);
-
- ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
- 0, i_size_read(inode), cached_state);
- io_ctl_drop_pages(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state, GFP_NOFS);
+ return 0;
+}
- if (ret)
- goto fail;
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
- if (ret) {
+ if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
GFP_NOFS);
- goto fail;
- }
- return 0;
-fail:
- return -1;
-
-out_nospc:
- return -ENOSPC;
+ return ret;
}
static void noinline_for_stack
@@ -1056,6 +1069,7 @@ cleanup_write_cache_enospc(struct inode *inode,
struct list_head *bitmap_list)
{
struct list_head *pos, *n;
+
list_for_each_safe(pos, n, bitmap_list) {
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
@@ -1088,64 +1102,104 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
{
struct extent_state *cached_state = NULL;
struct io_ctl io_ctl;
- struct list_head bitmap_list;
+ LIST_HEAD(bitmap_list);
int entries = 0;
int bitmaps = 0;
int ret;
- int err = -1;
-
- INIT_LIST_HEAD(&bitmap_list);
if (!i_size_read(inode))
return -1;
- ret = io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root, 1);
if (ret)
return -1;
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
+ spin_lock(&block_group->lock);
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+ }
+
/* Lock all pages first so we can lock the extent safely. */
io_ctl_prepare_pages(&io_ctl, inode, 0);
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state);
-
- /* Make sure we can fit our crcs into the first page */
- if (io_ctl.check_crcs &&
- (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
- goto out_nospc;
-
io_ctl_set_generation(&io_ctl, trans->transid);
+ /* Write out the extent entries in the free space cache */
ret = write_cache_extent_entries(&io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
if (ret)
goto out_nospc;
- ret = add_ioctl_entries(root, inode, block_group, &io_ctl,
- &cached_state, &bitmap_list, &entries);
+ /*
+ * Some space freed in the current transaction is pinned; it will
+ * be added back to the free space cache after the transaction is
+ * committed, so we must not lose it.
+ */
+ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+ if (ret)
+ goto out_nospc;
- if (ret == -ENOSPC)
+ /* At last, we write out all the bitmaps. */
+ ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ if (ret)
goto out_nospc;
- else if (ret)
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(&io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent; we will flush
+ * them out later.
+ */
+ io_ctl_drop_pages(&io_ctl);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
goto out;
- err = update_cache_item(trans, root, inode, path, offset,
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
entries, bitmaps);
-
out:
io_ctl_free(&io_ctl);
- if (err) {
+ if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
- return err;
+ return ret;
out_nospc:
-
cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
goto out;
}
@@ -1165,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
+
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7fa5f7fd7bc..3668048e16f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -693,7 +693,7 @@ retry:
ret = btrfs_reserve_extent(root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint, &ins, 1);
+ 0, alloc_hint, &ins, 1, 1);
if (ret) {
int i;
@@ -794,7 +794,7 @@ retry:
out:
return ret;
out_free_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
@@ -917,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- &ins, 1);
+ &ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -995,7 +995,7 @@ out:
return ret;
out_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -2599,6 +2599,21 @@ out_kfree:
return NULL;
}
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ ASSERT(cache);
+
+ spin_lock(&cache->lock);
+ cache->delalloc_bytes -= len;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -2698,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ if (!ret)
+ btrfs_release_delalloc_bytes(root,
+ ordered_extent->start,
+ ordered_extent->disk_len);
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -2750,7 +2769,7 @@ out:
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(root, ordered_extent->start,
- ordered_extent->disk_len);
+ ordered_extent->disk_len, 1);
}
@@ -6535,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
- alloc_hint, &ins, 1);
+ alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
ins.offset, ins.offset, ins.offset, 0);
if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
return em;
}
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
free_extent_map(em);
return ERR_PTR(ret);
}
@@ -7437,7 +7456,7 @@ free_ordered:
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len);
+ ordered->disk_len, 1);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -7445,39 +7464,30 @@ free_ordered:
}
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ const struct iov_iter *iter, loff_t offset)
{
int seg;
int i;
- size_t size;
- unsigned long addr;
unsigned blocksize_mask = root->sectorsize - 1;
ssize_t retval = -EINVAL;
- loff_t end = offset;
if (offset & blocksize_mask)
goto out;
- /* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (rw & WRITE)
- continue;
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ goto out;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (i = seg + 1; i < nr_segs; i++) {
- if (iov[seg].iov_base == iov[i].iov_base)
+ /* If this is a write we don't need to check anymore */
+ if (rw & WRITE)
+ return 0;
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec, if so return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
goto out;
}
}
@@ -7487,8 +7497,7 @@ out:
}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -7498,8 +7507,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
bool relock = false;
ssize_t ret;
- if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
- offset, nr_segs))
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
return 0;
atomic_inc(&inode->i_dio_count);
@@ -7511,7 +7519,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
* we need to flush the dirty pages again to make absolutely sure
* that any outstanding dirty pages are on disk.
*/
- count = iov_length(iov, nr_segs);
+ count = iov_iter_count(iter);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags))
filemap_fdatawrite_range(inode->i_mapping, offset, count);
@@ -7538,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ iter, offset, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (rw & WRITE) {
if (ret < 0 && ret != -EIOCBQUEUED)
@@ -8819,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
- *alloc_hint, &ins, 1);
+ *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8833,7 +8841,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
BTRFS_FILE_EXTENT_PREALLOC);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid,
- ins.offset);
+ ins.offset, 0);
btrfs_abort_transaction(trans, root, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 82c18ba12e3..47aceb494d1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
void btrfs_update_iflags(struct inode *inode)
{
struct btrfs_inode *ip = BTRFS_I(inode);
-
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ unsigned int new_fl = 0;
if (ip->flags & BTRFS_INODE_SYNC)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (ip->flags & BTRFS_INODE_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (ip->flags & BTRFS_INODE_APPEND)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (ip->flags & BTRFS_INODE_NOATIME)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+
+ set_mask_bits(&inode->i_flags,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+ new_fl);
}
/*
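
set_mask_bits() replaces the previous clear-then-set sequence on i_flags, which left a window where a concurrent reader could observe all five flags momentarily cleared. A user-space model of the compare-exchange loop behind it, with C11 atomics standing in for the kernel's cmpxchg() (a sketch, not the kernel macro itself):

#include <stdatomic.h>

/* Atomically replace the masked bits of *ptr with bits, retrying if
 * another thread updates the word in between; returns the new value. */
static unsigned long set_mask_bits_model(_Atomic unsigned long *ptr,
					 unsigned long mask,
					 unsigned long bits)
{
	unsigned long old = atomic_load(ptr);
	unsigned long new;

	do {
		new = (old & ~mask) | bits;
	} while (!atomic_compare_exchange_weak(ptr, &old, new));

	return new;
}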
@@ -1957,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- char *buf,
+ size_t *buf_size,
+ char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
{
@@ -1989,13 +1993,25 @@ static noinline int copy_to_sk(struct btrfs_root *root,
if (!key_in_sk(key, sk))
continue;
- if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ if (sizeof(sh) + item_len > *buf_size) {
+ if (*num_found) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * return one empty item back for v1, which does not
+ * handle -EOVERFLOW
+ */
+
+ *buf_size = sizeof(sh) + item_len;
item_len = 0;
+ ret = -EOVERFLOW;
+ }
- if (sizeof(sh) + item_len + *sk_offset >
- BTRFS_SEARCH_ARGS_BUFSIZE) {
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
ret = 1;
- goto overflow;
+ goto out;
}
sh.objectid = key->objectid;
@@ -2005,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root,
sh.transid = found_transid;
/* copy search result header */
- memcpy(buf + *sk_offset, &sh, sizeof(sh));
+ if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += sizeof(sh);
if (item_len) {
- char *p = buf + *sk_offset;
+ char __user *up = ubuf + *sk_offset;
/* copy the item */
- read_extent_buffer(leaf, p,
- item_off, item_len);
+ if (read_extent_buffer_to_user(leaf, up,
+ item_off, item_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += item_len;
}
(*num_found)++;
- if (*num_found >= sk->nr_items)
- break;
+ if (ret) /* -EOVERFLOW from above */
+ goto out;
+
+ if (*num_found >= sk->nr_items) {
+ ret = 1;
+ goto out;
+ }
}
advance_key:
ret = 0;
@@ -2033,22 +2062,37 @@ advance_key:
key->objectid++;
} else
ret = 1;
-overflow:
+out:
+ /*
+ * 0: all items from this leaf copied, continue with next
+ * 1: more items could be copied, but the unused buffer is too
+ *    small, or all requested items were found; either way this
+ *    stops the loop that iterates to the next leaf
+ * -EOVERFLOW: item was too large for the buffer
+ * -EFAULT: could not copy the extent buffer back to userspace
+ */
return ret;
}
static noinline int search_ioctl(struct inode *inode,
- struct btrfs_ioctl_search_args *args)
+ struct btrfs_ioctl_search_key *sk,
+ size_t *buf_size,
+ char __user *ubuf)
{
struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_ioctl_search_key *sk = &args->key;
struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
+ if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+ *buf_size = sizeof(struct btrfs_ioctl_search_header);
+ return -EOVERFLOW;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2082,14 +2126,15 @@ static noinline int search_ioctl(struct inode *inode,
ret = 0;
goto err;
}
- ret = copy_to_sk(root, path, &key, sk, args->buf,
+ ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
- if (ret || num_found >= sk->nr_items)
+ if (ret)
break;
}
- ret = 0;
+ if (ret > 0)
+ ret = 0;
err:
sk->nr_items = num_found;
btrfs_free_path(path);
@@ -2099,22 +2144,73 @@ err:
static noinline int btrfs_ioctl_tree_search(struct file *file,
void __user *argp)
{
- struct btrfs_ioctl_search_args *args;
- struct inode *inode;
- int ret;
+ struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_key sk;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- args = memdup_user(argp, sizeof(*args));
- if (IS_ERR(args))
- return PTR_ERR(args);
+ uargs = (struct btrfs_ioctl_search_args __user *)argp;
+
+ if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+ return -EFAULT;
+
+ buf_size = sizeof(uargs->buf);
inode = file_inode(file);
- ret = search_ioctl(inode, args);
- if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+ /*
+ * In the original implementation an overflow is handled by returning a
+ * search header with a len of zero, so reset ret.
+ */
+ if (ret == -EOVERFLOW)
+ ret = 0;
+
+ if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
ret = -EFAULT;
- kfree(args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 args;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
+ const size_t buf_limit = 16 * 1024 * 1024;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* copy search header and buffer size */
+ uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
+ if (copy_from_user(&args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ buf_size = args.buf_size;
+
+ if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+ return -EOVERFLOW;
+
+ /* limit result size to 16MB */
+ if (buf_size > buf_limit)
+ buf_size = buf_limit;
+
+ inode = file_inode(file);
+ ret = search_ioctl(inode, &args.key, &buf_size,
+ (char *)(&uarg->buf[0]));
+ if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+ ret = -EFAULT;
+ else if (ret == -EOVERFLOW &&
+ copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+ ret = -EFAULT;
+
return ret;
}
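
Unlike v1, the v2 search ioctl takes a caller-chosen buffer size and, on -EOVERFLOW, reports the size the next call would need through buf_size. A hedged user-space sketch of driving it, assuming <linux/btrfs.h> exports struct btrfs_ioctl_search_args_v2 and BTRFS_IOC_TREE_SEARCH_V2 as added by this series (error handling trimmed):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	size_t buf_size = 64 * 1024;	/* anything up to the 16MB cap */
	struct btrfs_ioctl_search_args_v2 *args;
	int fd, ret;

	fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);
	args = calloc(1, sizeof(*args) + buf_size);

	args->key.tree_id = 1;		/* BTRFS_ROOT_TREE_OBJECTID */
	args->key.max_objectid = (__u64)-1;
	args->key.max_offset = (__u64)-1;
	args->key.max_transid = (__u64)-1;
	args->key.max_type = (__u32)-1;
	args->key.nr_items = 4096;	/* upper bound on items wanted */
	args->buf_size = buf_size;

	ret = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, args);
	if (ret < 0 && errno == EOVERFLOW)
		fprintf(stderr, "retry with a %llu byte buffer\n",
			(unsigned long long)args->buf_size);
	else if (ret == 0)
		printf("copied %u items\n", args->key.nr_items);
	return ret < 0;
}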
@@ -3046,7 +3142,6 @@ out:
static void clone_update_extent_map(struct inode *inode,
const struct btrfs_trans_handle *trans,
const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
const u64 hole_offset,
const u64 hole_len)
{
@@ -3061,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode,
return;
}
- if (fi) {
+ if (path) {
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
em->generation = -1;
if (btrfs_file_extent_type(path->nodes[0], fi) ==
@@ -3415,18 +3514,15 @@ process_slot:
btrfs_item_ptr_offset(leaf, slot),
size);
inode_add_bytes(inode, datal);
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
}
/* If we have an implicit hole (NO_HOLES feature). */
if (drop_start < new_key.offset)
clone_update_extent_map(inode, trans,
- path, NULL, drop_start,
+ NULL, drop_start,
new_key.offset - drop_start);
- clone_update_extent_map(inode, trans, path,
- extent, 0, 0);
+ clone_update_extent_map(inode, trans, path, 0, 0);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -3469,12 +3565,10 @@ process_slot:
btrfs_end_transaction(trans, root);
goto out;
}
+ clone_update_extent_map(inode, trans, NULL, last_dest_end,
+ destoff + len - last_dest_end);
ret = clone_finish_inode_update(trans, inode, destoff + len,
destoff, olen);
- if (ret)
- goto out;
- clone_update_extent_map(inode, trans, path, NULL, last_dest_end,
- destoff + len - last_dest_end);
}
out:
@@ -5198,6 +5292,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
+ case BTRFS_IOC_TREE_SEARCH_V2:
+ return btrfs_ioctl_tree_search_v2(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
case BTRFS_IOC_INO_PATHS:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01277b8f237..5665d214924 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
@@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
again:
+ BUG_ON(!atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner);
+
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) &&
current->pid == eb->lock_owner) {
@@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers))
return 0;
- read_lock(&eb->lock);
+ if (!read_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
@@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers))
return 0;
- write_lock(&eb->lock);
+
+ if (!write_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
@@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
@@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
@@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
BUG_ON(blockers > 1);
btrfs_assert_tree_locked(eb);
+ eb->lock_owner = 0;
atomic_dec(&eb->write_locks);
if (blockers) {
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6efd70d3b64..9626b4ad3b9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
-static void print_extent_item(struct extent_buffer *eb, int slot)
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- int type;
u32 item_size = btrfs_item_size_nr(eb, slot);
u64 flags;
u64 offset;
@@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
flags);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_tree_block_key(eb, info, &key);
@@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_disk_root_refs(l, ri));
break;
case BTRFS_EXTENT_ITEM_KEY:
- print_extent_item(l, i);
+ case BTRFS_METADATA_ITEM_KEY:
+ print_extent_item(l, i, type);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
printk(KERN_INFO "\t\ttree block backref\n");
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cf5aead95a7..98cb6b2630f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1798,8 +1798,10 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
return -ENOMEM;
tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
+ if (!tmp) {
+ ulist_free(qgroups);
return -ENOMEM;
+ }
btrfs_get_tree_mod_seq(fs_info, &elem);
ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4055291a523..4a88f073fdd 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* pages are going to be uptodate.
*/
for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
- if (rbio->faila == stripe ||
- rbio->failb == stripe)
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+ atomic_inc(&rbio->bbio->error);
continue;
+ }
for (pagenr = 0; pagenr < nr_pages; pagenr++) {
struct page *p;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 30947f92362..09230cf3a24 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -428,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
continue;
}
if (!dev->bdev) {
- /* cannot read ahead on missing device */
- continue;
+ /*
+ * cannot read ahead on a missing device, but for RAID5/6,
+ * REQ_GET_READ_MIRRORS returns 1, so don't skip the missing
+ * device in that case.
+ */
+ if (nzones > 1)
+ continue;
}
if (dev_replace_is_ongoing &&
dev == fs_info->dev_replace.tgtdev) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ac80188eec8..b6d198f5181 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2725,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start) {
- key.offset = found_key.offset + length;
- btrfs_release_path(path);
- continue;
- }
+ if (found_key.offset + length <= start)
+ goto skip;
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2740,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* the chunk from going away while we scrub it
*/
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -2802,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
-
+skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4662d92a4b7..8e16bca69c5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_ssd_spread:
btrfs_set_and_info(root, SSD_SPREAD,
"use spread ssd allocation scheme");
+ btrfs_set_opt(info->mount_opt, SSD);
break;
case Opt_nossd:
- btrfs_clear_and_info(root, NOSSD,
+ btrfs_set_and_info(root, NOSSD,
"not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
break;
@@ -1467,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
/* recover relocation */
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret)
goto restore;
@@ -1808,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
list_for_each_entry(dev, head, dev_list) {
if (dev->missing)
continue;
+ if (!dev->name)
+ continue;
if (!first_dev || dev->devid < first_dev->devid)
first_dev = dev;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index df39458f148..78699364f53 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -605,14 +605,37 @@ static void init_feature_attrs(void)
}
}
-static int add_device_membership(struct btrfs_fs_info *fs_info)
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!fs_info->device_dir_kobj)
+ return -EINVAL;
+
+ if (one_device) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_info->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
{
int error = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *dev;
- fs_info->device_dir_kobj = kobject_create_and_add("devices",
+ if (!fs_info->device_dir_kobj)
+ fs_info->device_dir_kobj = kobject_create_and_add("devices",
&fs_info->super_kobj);
+
if (!fs_info->device_dir_kobj)
return -ENOMEM;
@@ -623,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
if (!dev->bdev)
continue;
+ if (one_device && one_device != dev)
+ continue;
+
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
@@ -666,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
if (error)
goto failure;
- error = add_device_membership(fs_info);
+ error = btrfs_kobj_add_device(fs_info, NULL);
if (error)
goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9ab576318a8..ac46df37504 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
#endif /* _BTRFS_SYSFS_H_ */
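
A hedged sketch of how this new pair is meant to be used; it mirrors the volumes.c hunks further down, with error handling trimmed, and is kernel-context orientation code rather than a standalone program. Note that passing NULL as one_device (as btrfs_sysfs_add_one does above) links every open device rather than a single one.

/* Sketch only: fs_info and device come from the caller's context. */
static int sketch_device_hotplug(struct btrfs_fs_info *fs_info,
				 struct btrfs_device *device)
{
	int ret;

	/* creates the per-device symlink under /sys/fs/btrfs/<fsid>/devices */
	ret = btrfs_kobj_add_device(fs_info, device);
	if (ret)
		return ret;

	/* ... use the device; on removal or error, drop the symlink again */
	btrfs_kobj_rm_device(fs_info, device);
	return 0;
}
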
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index a5dcacb5df9..9626252ee6b 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -135,7 +135,7 @@ restart:
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
- eb = radix_tree_deref_slot(slot);
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
if (!eb)
continue;
/* Shouldn't happen but that kind of thinking creates CVE's */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index fa691b754aa..ec3dcb20235 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -415,6 +415,8 @@ int btrfs_test_qgroups(void)
ret = -ENOMEM;
goto out;
}
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
root->alloc_bytenr += 8192;
tmp_root = btrfs_alloc_dummy_root();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9630f10f8e1..5f379affdf2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
bool reloc_reserved = false;
int ret;
+ /* Send isn't supposed to start transactions. */
+ ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
- if (current->journal_info &&
- current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
+ if (current->journal_info) {
WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
h->use_count++;
@@ -491,6 +493,7 @@ again:
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
may_wait_transaction(root, type)) {
+ current->journal_info = h;
btrfs_commit_transaction(h, root);
goto again;
}
@@ -1284,11 +1287,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- pending->error = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (pending->error)
- goto no_free_objectid;
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@@ -1613,11 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_run_delayed_items(trans, root);
- /*
- * running the delayed items may have added new refs. account
- * them now so that they hinder processing of more delayed refs
- * as little as possible.
- */
if (ret)
return ret;
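
The one-line current->journal_info = h fix above matters because the commit path and its callees consult current->journal_info to recognise the running handle (btrfs_end_transaction, for instance, clears it only when it matches the handle being ended), so the handle must be published before handing off. A standalone toy model of that contract, with illustrative names rather than kernel API:

#include <assert.h>
#include <stddef.h>

static _Thread_local void *journal_info;  /* stands in for current->journal_info */

struct handle { int use_count; };

static void commit_transaction(struct handle *h)
{
	/* the real commit path may unwind the active handle it finds here */
	assert(journal_info == h);
	journal_info = NULL;
}

static void blocked_start_path(struct handle *h)
{
	journal_info = h;	/* the one-line fix modelled here */
	commit_transaction(h);
}

int main(void)
{
	struct handle h = { .use_count = 1 };
	blocked_start_path(&h);
	return 0;
}
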
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ffeed6d6326..6104676857f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,7 @@
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
+#include "sysfs.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
- name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
- if (!name) {
- kfree(device);
- goto error;
+ if (orig_dev->name) {
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
}
- rcu_assign_pointer(device->name, name);
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
@@ -1680,6 +1683,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev)
device->fs_devices->open_devices--;
+ /* remove sysfs entry */
+ btrfs_kobj_rm_device(root->fs_info, device);
+
call_rcu(&device->rcu, free_device);
num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
@@ -2143,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
+
+ /* add sysfs device entry */
+ btrfs_kobj_add_device(root->fs_info, device);
+
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
ret = init_first_rw_device(trans, root, device);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2156,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+
+ /* Sprouting would change the fsid of the mounted root,
+ * so rename the fsid entry in sysfs.
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+ root->fs_info->fsid);
+ if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+ goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
if (ret) {
@@ -2205,6 +2224,7 @@ error_trans:
unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
+ btrfs_kobj_rm_device(root->fs_info, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2543,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- kfree(map);
- em->bdev = NULL;
-
/* once for the tree */
free_extent_map(em);
/* once for us */
@@ -4301,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em = alloc_extent_map();
if (!em) {
+ kfree(map);
ret = -ENOMEM;
goto error;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
em->len = num_bytes;
@@ -4346,7 +4365,6 @@ error_del_extent:
/* One for the tree reference */
free_extent_map(em);
error:
- kfree(map);
kfree(devices_info);
return ret;
}
@@ -4558,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
write_unlock(&tree->map_tree.lock);
if (!em)
break;
- kfree(em->bdev);
/* once for us */
free_extent_map(em);
/* once for the tree */
@@ -5362,6 +5379,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+ if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+ bio_endio_nodec(bio, err);
+ else
+ bio_endio(bio, err);
+ kfree(bbio);
+}
+
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
@@ -5402,12 +5428,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
bio = bbio->orig_bio;
}
- /*
- * We have original bio now. So increment bi_remaining to
- * account for it in endio
- */
- atomic_inc(&bio->bi_remaining);
-
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5424,9 +5444,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
- kfree(bbio);
- bio_endio(bio, err);
+ btrfs_end_bbio(bbio, bio, err);
} else if (!is_orig_bio) {
bio_put(bio);
}
@@ -5589,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ /* Should be the original bio. */
+ WARN_ON(bio != bbio->orig_bio);
+
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- kfree(bbio);
- bio_endio(bio, -EIO);
+
+ btrfs_end_bbio(bbio, bio, -EIO);
}
}
@@ -5681,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
+ bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
}
submit_stripe_bio(root, bbio, bio,
@@ -5822,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
return -ENOMEM;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
@@ -5846,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -5854,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(root, devid, uuid);
if (!map->stripes[i].dev) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
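
Several volumes.c hunks above (the removed kfree(map) calls, the new EXTENT_FLAG_FS_MAPPING bit, and em->bdev carrying the map_lookup) amount to one ownership change: the extent_map now owns the chunk map and frees it exactly once when the last reference drops, instead of every error path freeing it separately. A standalone toy model of that pattern, with illustrative names:

#include <stdlib.h>

#define FLAG_FS_MAPPING 0x1

struct toy_em {
	unsigned long flags;
	void *bdev;		/* a map_lookup in disguise when tagged */
	int refs;
};

static void toy_free_extent_map(struct toy_em *em)
{
	if (--em->refs)
		return;
	if (em->flags & FLAG_FS_MAPPING)
		free(em->bdev);	/* single owner: reclaimed exactly once, here */
	free(em);
}

int main(void)
{
	struct toy_em *em = calloc(1, sizeof(*em));

	em->refs = 2;			/* one for the tree, one for the caller */
	em->flags |= FLAG_FS_MAPPING;
	em->bdev = malloc(64);		/* stands in for the chunk map */

	toy_free_extent_map(em);	/* tree reference */
	toy_free_extent_map(em);	/* caller reference; map freed here */
	return 0;
}
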
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1a15bbeb65e..2aaa00c4781 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -190,11 +190,14 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_bio {
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
+ unsigned long flags;
void *private;
atomic_t error;
int max_errors;
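
The new flag closes the loop on the endio rework above: bio_endio() consumes one bi_remaining reference per call, so an original bio that was itself submitted as a stripe bio has already spent its reference by the time btrfs finishes it and must go through bio_endio_nodec(), while an original that was only ever cloned still holds its reference and takes plain bio_endio(). A loose standalone model of that rule (illustrative names, not the block-layer API):

#include <stdio.h>

struct toy_bio { int remaining; };	/* ~ bio->bi_remaining */

static void toy_endio(struct toy_bio *b)	/* ~ bio_endio(): consumes a ref */
{
	if (--b->remaining == 0)
		printf("end_io called\n");
}

static void toy_endio_nodec(struct toy_bio *b)	/* ~ bio_endio_nodec() */
{
	b->remaining++;		/* compensate, then complete as usual */
	toy_endio(b);
}

static void toy_end_bbio(struct toy_bio *orig, int orig_was_submitted)
{
	if (orig_was_submitted)
		toy_endio_nodec(orig);	/* ref already spent at device completion */
	else
		toy_endio(orig);	/* clone-only original still holds its ref */
}

int main(void)
{
	struct toy_bio submitted = { .remaining = 0 };	/* after device completion */
	struct toy_bio clone_only = { .remaining = 1 };

	toy_end_bbio(&submitted, 1);
	toy_end_bbio(&clone_only, 0);
	return 0;
}
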
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 4f196314c0c..b67d8fc8127 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (workspace->def_strm.total_in > 8192 &&
workspace->def_strm.total_in <
workspace->def_strm.total_out) {
- ret = -EIO;
+ ret = -E2BIG;
goto out;
}
/* we need another page for writing out. Test this