diff options
58 files changed, 5173 insertions, 2057 deletions
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 5dd282dda55..d11cc2f8077 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -38,7 +38,7 @@ Mount Options ============= When mounting a btrfs filesystem, the following option are accepted. -Unless otherwise specified, all options default to off. +Options with (*) are default options and will not show in the mount options. alloc_start=<bytes> Debugging option to force all block allocations above a certain @@ -46,10 +46,12 @@ Unless otherwise specified, all options default to off. bytes, optionally with a K, M, or G suffix, case insensitive. Default is 1MB. + noautodefrag(*) autodefrag - Detect small random writes into files and queue them up for the - defrag process. Works best for small files; Not well suited for - large database workloads. + Disable/enable auto defragmentation. + Auto defragmentation detects small random writes into files and queue + them up for the defrag process. Works best for small files; + Not well suited for large database workloads. check_int check_int_data @@ -96,21 +98,26 @@ Unless otherwise specified, all options default to off. can be avoided. Especially useful when trying to mount a multi-device setup as root. May be specified multiple times for multiple devices. + nodiscard(*) discard - Issue frequent commands to let the block device reclaim space freed by - the filesystem. This is useful for SSD devices, thinly provisioned + Disable/enable discard mount option. + Discard issues frequent commands to let the block device reclaim space + freed by the filesystem. + This is useful for SSD devices, thinly provisioned LUNs and virtual machine images, but may have a significant performance impact. (The fstrim command is also available to initiate batch trims from userspace). + noenospc_debug(*) enospc_debug - Debugging option to be more verbose in some ENOSPC conditions. + Disable/enable debugging option to be more verbose in some ENOSPC conditions. fatal_errors=<action> Action to take when encountering a fatal error: "bug" - BUG() on a fatal error. This is the default. "panic" - panic() on a fatal error. + noflushoncommit(*) flushoncommit The 'flushoncommit' mount option forces any data dirtied by a write in a prior transaction to commit as part of the current commit. This makes @@ -134,26 +141,32 @@ Unless otherwise specified, all options default to off. Specify that 1 metadata chunk should be allocated after every <value> data chunks. Off by default. + acl(*) noacl - Disable support for Posix Access Control Lists (ACLs). See the + Enable/disable support for Posix Access Control Lists (ACLs). See the acl(5) manual page for more information about ACLs. + barrier(*) nobarrier - Disables the use of block layer write barriers. Write barriers ensure - that certain IOs make it through the device cache and are on persistent - storage. If used on a device with a volatile (non-battery-backed) - write-back cache, this option will lead to filesystem corruption on a - system crash or power loss. + Enable/disable the use of block layer write barriers. Write barriers + ensure that certain IOs make it through the device cache and are on + persistent storage. If disabled on a device with a volatile + (non-battery-backed) write-back cache, nobarrier option will lead to + filesystem corruption on a system crash or power loss. + datacow(*) nodatacow - Disable data copy-on-write for newly created files. Implies nodatasum, - and disables all compression. + Enable/disable data copy-on-write for newly created files. + Nodatacow implies nodatasum, and disables all compression. + datasum(*) nodatasum - Disable data checksumming for newly created files. + Enable/disable data checksumming for newly created files. + Datasum implies datacow. + treelog(*) notreelog - Disable the tree logging used for fsync and O_SYNC writes. + Enable/disable the tree logging used for fsync and O_SYNC writes. recovery Enable autorecovery attempts if a bad tree root is found at mount time. diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index aa976eced2d..a66768ebc8d 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,6 +1,7 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select LIBCRC32C + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 1a44e42d602..f341a98031d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o + uuid-tree.o props.o hash.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3775947429b..aded3ef3d3d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, return 0; } +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, struct extent_inode_elem **eie) @@ -209,18 +219,19 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, int level, - struct btrfs_key *key_for_search, u64 time_seq, - u64 wanted_disk_byte, - const u64 *extent_item_pos) + struct ulist *parents, struct __prelim_ref *ref, + int level, u64 time_seq, const u64 *extent_item_pos) { int ret = 0; int slot; struct extent_buffer *eb; struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; if (level != 0) { eb = path->nodes[level]; @@ -238,7 +249,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -254,6 +265,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; + count++; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -273,6 +285,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, old = old->next; old->next = eie; } + eie = NULL; } next: ret = btrfs_next_old_item(root, path, time_seq); @@ -280,6 +293,8 @@ next: if (ret > 0) ret = 0; + else if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -299,23 +314,34 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; + int index; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + root = btrfs_read_fs_root_no_name(fs_info, &root_key); if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); goto out; } root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) + if (root_level + 1 == level) { + srcu_read_unlock(&fs_info->subvol_srcu, index); goto out; + } path->lowest_level = level; ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + + /* root node has been locked, we can release @subvol_srcu safely here */ + srcu_read_unlock(&fs_info->subvol_srcu, index); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", ref->root_id, level, ref->count, ret, @@ -334,9 +360,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, level, &ref->key_for_search, - time_seq, ref->wanted_disk_byte, - extent_item_pos); + ret = add_all_parents(root, path, parents, ref, level, time_seq, + extent_item_pos); out: path->lowest_level = 0; btrfs_release_path(path); @@ -376,10 +401,16 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos); - if (err == -ENOMEM) - goto out; - if (err) + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. + */ + if (err == -ENOENT) { continue; + } else if (err) { + ret = err; + goto out; + } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); @@ -538,14 +569,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(&op_key, &extent_op->key); - while ((n = rb_prev(n))) { + spin_lock(&head->lock); + n = rb_first(&head->ref_root); + while (n) { struct btrfs_delayed_ref_node *node; node = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - + n = rb_next(n); if (node->seq > seq) continue; @@ -612,10 +642,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, WARN_ON(1); } if (ret) - return ret; + break; } - - return 0; + spin_unlock(&head->lock); + return ret; } /* @@ -828,6 +858,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; + struct extent_inode_elem *eie = NULL; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -882,15 +913,15 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } + spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, &prefs_delayed); mutex_unlock(&head->mutex); - if (ret) { - spin_unlock(&delayed_refs->lock); + if (ret) goto out; - } + } else { + spin_unlock(&delayed_refs->lock); } - spin_unlock(&delayed_refs->lock); } if (path->slots[0]) { @@ -941,7 +972,6 @@ again: goto out; } if (ref->count && ref->parent) { - struct extent_inode_elem *eie = NULL; if (extent_item_pos && !ref->inode_list) { u32 bsz; struct extent_buffer *eb; @@ -976,6 +1006,7 @@ again: eie = eie->next; eie->next = ref->inode_list; } + eie = NULL; } list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); @@ -994,7 +1025,8 @@ out: list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); } - + if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -1002,7 +1034,6 @@ static void free_leaf_list(struct ulist *blocks) { struct ulist_node *node = NULL; struct extent_inode_elem *eie; - struct extent_inode_elem *eie_next; struct ulist_iterator uiter; ULIST_ITER_INIT(&uiter); @@ -1010,10 +1041,7 @@ static void free_leaf_list(struct ulist *blocks) if (!node->aux) continue; eie = (struct extent_inode_elem *)(uintptr_t)node->aux; - for (; eie; eie = eie_next) { - eie_next = eie->next; - kfree(eie); - } + free_inode_elem_list(eie); node->aux = 0; } @@ -1101,44 +1129,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!node) break; bytenr = node->val; + cond_resched(); } ulist_free(tmp); return 0; } - -static int __inode_info(u64 inum, u64 ioff, u8 key_type, - struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_key *found_key) -{ - int ret; - struct btrfs_key key; - struct extent_buffer *eb; - - key.type = key_type; - key.objectid = inum; - key.offset = ioff; - - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if (ret < 0) - return ret; - - eb = path->nodes[0]; - if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_root, path); - if (ret) - return ret; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); - if (found_key->type != key.type || found_key->objectid != key.objectid) - return 1; - - return 0; -} - /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1146,16 +1143,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path) { struct btrfs_key key; - return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, - &key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_ITEM_KEY, &key); } static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) { - return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, - found_key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_REF_KEY, found_key); } int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, @@ -1335,20 +1332,45 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_item(fs_info->extent_root, path, - 0, BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - return ret; - btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + while (1) { + u32 nritems; + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(fs_info->extent_root, path); + if (ret != 0) { + if (ret > 0) { + pr_debug("logical %llu is not within " + "any extent\n", logical); + ret = -ENOENT; + } + return ret; + } + } else { + path->slots[0]--; + } + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) { + pr_debug("logical %llu is not within any extent\n", + logical); + return -ENOENT; + } + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, + path->slots[0]); + if (found_key->type == BTRFS_EXTENT_ITEM_KEY || + found_key->type == BTRFS_METADATA_ITEM_KEY) + break; + } + if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; - if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && - found_key->type != BTRFS_METADATA_ITEM_KEY) || - found_key->objectid > logical || + if (found_key->objectid > logical || found_key->objectid + size <= logical) { pr_debug("logical %llu is not within any extent\n", logical); return -ENOENT; @@ -1601,7 +1623,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1614,9 +1635,12 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, parent = found_key.offset; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); @@ -1674,17 +1698,20 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ++found; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); cur_offset = 0; while (cur_offset < item_size) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ac0b39db27d..8fed2125689 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -43,6 +43,7 @@ #define BTRFS_INODE_COPY_EVERYTHING 8 #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 +#define BTRFS_INODE_HAS_PROPS 11 /* in memory btrfs inode */ struct btrfs_inode { @@ -135,6 +136,9 @@ struct btrfs_inode { */ u64 index_cnt; + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; + /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index cb05e1c842c..49a62b4dda3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1456,10 +1456,14 @@ static int btrfsic_handle_extent_data( btrfsic_read_from_block_data(block_ctx, &file_extent_item, file_extent_item_offset, sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + - btrfs_stack_file_extent_offset(&file_extent_item); - generation = btrfs_stack_file_extent_generation(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } generation = btrfs_stack_file_extent_generation(&file_extent_item); if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f5cdeb4b553..e2600cdb6c2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -128,11 +128,10 @@ static int check_compressed_csum(struct inode *inode, kunmap_atomic(kaddr); if (csum != *cb_sum) { - printk(KERN_INFO "btrfs csum failed ino %llu " - "extent %llu csum %u " - "wanted %u mirror %d\n", - btrfs_ino(inode), disk_start, csum, *cb_sum, - cb->mirror_num); + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -411,7 +410,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); } if (bytes_left < PAGE_CACHE_SIZE) { - printk("bytes left %lu compress len %lu nr %lu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_CACHE_SIZE; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 316136bd6dd..cbd3a7d6fa6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -475,6 +474,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * the index is the shifted logical of the *new* root node for root replace * operations, or the shifted logical of the affected block for all other * operations. + * |