aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode_dotl.c24
-rw-r--r--fs/affs/affs.h8
-rw-r--r--fs/aio.c6
-rw-r--r--fs/attr.c5
-rw-r--r--fs/binfmt_elf.c8
-rw-r--r--fs/binfmt_flat.c8
-rw-r--r--fs/btrfs/acl.c4
-rw-r--r--fs/btrfs/backref.c495
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h50
-rw-r--r--fs/btrfs/check-integrity.c584
-rw-r--r--fs/btrfs/ctree.c861
-rw-r--r--fs/btrfs/ctree.h78
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/delayed-ref.c10
-rw-r--r--fs/btrfs/delayed-ref.h24
-rw-r--r--fs/btrfs/disk-io.c57
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/export.c15
-rw-r--r--fs/btrfs/extent-tree.c23
-rw-r--r--fs/btrfs/extent_io.c168
-rw-r--r--fs/btrfs/extent_io.h8
-rw-r--r--fs/btrfs/file.c78
-rw-r--r--fs/btrfs/free-space-cache.c52
-rw-r--r--fs/btrfs/inode.c317
-rw-r--r--fs/btrfs/ioctl.c50
-rw-r--r--fs/btrfs/ioctl.h33
-rw-r--r--fs/btrfs/ordered-data.c165
-rw-r--r--fs/btrfs/ordered-data.h13
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/reada.c5
-rw-r--r--fs/btrfs/scrub.c65
-rw-r--r--fs/btrfs/super.c117
-rw-r--r--fs/btrfs/transaction.c59
-rw-r--r--fs/btrfs/tree-log.c35
-rw-r--r--fs/btrfs/ulist.c38
-rw-r--r--fs/btrfs/ulist.h15
-rw-r--r--fs/btrfs/volumes.c306
-rw-r--r--fs/btrfs/volumes.h52
-rw-r--r--fs/btrfs/xattr.c1
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/ceph/export.c32
-rw-r--r--fs/compat.c33
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/ecryptfs/inode.c48
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exportfs/expfs.c33
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/balloc.c41
-rw-r--r--fs/ext4/bitmap.c83
-rw-r--r--fs/ext4/dir.c12
-rw-r--r--fs/ext4/ext4.h130
-rw-r--r--fs/ext4/ext4_extents.h24
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/ext4_jbd2.h7
-rw-r--r--fs/ext4/extents.c91
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/ialloc.c81
-rw-r--r--fs/ext4/inode.c119
-rw-r--r--fs/ext4/ioctl.c19
-rw-r--r--fs/ext4/mballoc.c30
-rw-r--r--fs/ext4/mmp.c44
-rw-r--r--fs/ext4/namei.c445
-rw-r--r--fs/ext4/resize.c71
-rw-r--r--fs/ext4/super.c253
-rw-r--r--fs/ext4/xattr.c92
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/fat/inode.c9
-rw-r--r--fs/fcntl.c42
-rw-r--r--fs/file_table.c17
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/fuse/inode.c17
-rw-r--r--fs/gfs2/export.c17
-rw-r--r--fs/hpfs/alloc.c14
-rw-r--r--fs/hpfs/anode.c43
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/dnode.c10
-rw-r--r--fs/hpfs/ea.c60
-rw-r--r--fs/hpfs/hpfs.h289
-rw-r--r--fs/hpfs/hpfs_fn.h16
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hpfs/map.c20
-rw-r--r--fs/hpfs/namei.c2
-rw-r--r--fs/hpfs/super.c4
-rw-r--r--fs/inode.c124
-rw-r--r--fs/internal.h3
-rw-r--r--fs/isofs/export.c13
-rw-r--r--fs/jbd2/Kconfig2
-rw-r--r--fs/jbd2/commit.c70
-rw-r--r--fs/jbd2/journal.c132
-rw-r--r--fs/jbd2/recovery.c126
-rw-r--r--fs/jbd2/revoke.c27
-rw-r--r--fs/jbd2/transaction.c4
-rw-r--r--fs/jffs2/jffs2_fs_sb.h4
-rw-r--r--fs/jffs2/os-linux.h7
-rw-r--r--fs/jffs2/super.c21
-rw-r--r--fs/jffs2/wbuf.c55
-rw-r--r--fs/lockd/svc.c145
-rw-r--r--fs/locks.c5
-rw-r--r--fs/namei.c177
-rw-r--r--fs/namespace.c142
-rw-r--r--fs/ncpfs/file.c6
-rw-r--r--fs/ncpfs/ncp_fs_sb.h10
-rw-r--r--fs/nfs/callback.c13
-rw-r--r--fs/nfs/dir.c56
-rw-r--r--fs/nfs/file.c77
-rw-r--r--fs/nfsd/auth.c2
-rw-r--r--fs/nfsd/export.c8
-rw-r--r--fs/nfsd/fault_inject.c1
-rw-r--r--fs/nfsd/nfs4callback.c5
-rw-r--r--fs/nfsd/nfs4idmap.c4
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4state.c525
-rw-r--r--fs/nfsd/nfs4xdr.c62
-rw-r--r--fs/nfsd/nfsctl.c12
-rw-r--r--fs/nfsd/nfssvc.c23
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/xdr4.h6
-rw-r--r--fs/nilfs2/namei.c22
-rw-r--r--fs/notify/fsnotify.c12
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ocfs2/blockcheck.c42
-rw-r--r--fs/ocfs2/dlm/dlmast.c2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/export.c19
-rw-r--r--fs/ocfs2/inode.c13
-rw-r--r--fs/ocfs2/ioctl.c31
-rw-r--r--fs/ocfs2/move_extents.c6
-rw-r--r--fs/ocfs2/namei.c5
-rw-r--r--fs/ocfs2/symlink.c115
-rw-r--r--fs/ocfs2/symlink.h2
-rw-r--r--fs/open.c76
-rw-r--r--fs/pipe.c7
-rw-r--r--fs/pnode.c4
-rw-r--r--fs/proc_namespace.c4
-rw-r--r--fs/readdir.c33
-rw-r--r--fs/reiserfs/inode.c30
-rw-r--r--fs/reiserfs/journal.c15
-rw-r--r--fs/reiserfs/reiserfs.h12
-rw-r--r--fs/reiserfs/resize.c1
-rw-r--r--fs/reiserfs/super.c74
-rw-r--r--fs/signalfd.c7
-rw-r--r--fs/splice.c6
-rw-r--r--fs/statfs.c5
-rw-r--r--fs/sync.c5
-rw-r--r--fs/ubifs/dir.c11
-rw-r--r--fs/udf/namei.c14
-rw-r--r--fs/utimes.c5
-rw-r--r--fs/xattr.c20
-rw-r--r--fs/xfs/kmem.c10
-rw-r--r--fs/xfs/kmem.h21
-rw-r--r--fs/xfs/xfs_export.c23
-rw-r--r--fs/xfs/xfs_file.c7
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans.h2
158 files changed, 6186 insertions, 2516 deletions
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a1e6c990cd4..e3dd2a1e2bf 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
- struct dentry *dentry;
-
- spin_lock(&inode->i_lock);
- /* Directory should have only one entry. */
- BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
- dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
- spin_unlock(&inode->i_lock);
- return dentry;
-}
-
static int v9fs_test_inode_dotl(struct inode *inode, void *data)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -415,7 +397,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
if (dir->i_mode & S_ISGID)
omode |= S_ISGID;
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
@@ -793,7 +775,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid))
return PTR_ERR(dfid);
@@ -858,7 +840,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
return -EINVAL;
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 45a0ce45d7b..1fceb320d2f 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -18,14 +18,6 @@
#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
-#ifdef __LITTLE_ENDIAN
-#define BO_EXBITS 0x18UL
-#elif defined(__BIG_ENDIAN)
-#define BO_EXBITS 0x00UL
-#else
-#error Endianness must be known for affs to work.
-#endif
-
#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data)
#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data)
diff --git a/fs/aio.c b/fs/aio.c
index 8c7c8b80537..55c4c765605 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -134,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx)
info->mmap_size = nr_pages * PAGE_SIZE;
dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
down_write(&ctx->mm->mmap_sem);
- info->mmap_base = do_mmap(NULL, 0, info->mmap_size,
- PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
- 0);
+ info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0);
if (IS_ERR((void *)info->mmap_base)) {
up_write(&ctx->mm->mmap_sem);
info->mmap_size = 0;
diff --git a/fs/attr.c b/fs/attr.c
index 584620e5dee..0da90951d27 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -176,6 +176,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
return -EPERM;
}
+ if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
+ if (attr->ia_size != inode->i_size)
+ inode_inc_iversion(inode);
+ }
+
if ((ia_valid & ATTR_MODE)) {
umode_t amode = attr->ia_mode;
/* Flag setting protected by i_mutex */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e658dd134b9..1b52956afe3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -329,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
if (!size)
return addr;
- down_write(&current->mm->mmap_sem);
/*
* total_size is the size of the ELF (interpreter) image.
* The _first_ mmap needs to know the full size, otherwise
@@ -340,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
*/
if (total_size) {
total_size = ELF_PAGEALIGN(total_size);
- map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+ map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
if (!BAD_ADDR(map_addr))
- do_munmap(current->mm, map_addr+size, total_size-size);
+ vm_munmap(map_addr+size, total_size-size);
} else
- map_addr = do_mmap(filep, addr, size, prot, type, off);
+ map_addr = vm_mmap(filep, addr, size, prot, type, off);
- up_write(&current->mm->mmap_sem);
return(map_addr);
}
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 6b2daf99fab..178cb70acc2 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -562,7 +562,7 @@ static int load_flat_file(struct linux_binprm * bprm,
realdatastart = (unsigned long) -ENOMEM;
printk("Unable to allocate RAM for process data, errno %d\n",
(int)-realdatastart);
- do_munmap(current->mm, textpos, text_len);
+ vm_munmap(textpos, text_len);
ret = realdatastart;
goto err;
}
@@ -586,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm,
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read data+bss, errno %d\n", (int)-result);
- do_munmap(current->mm, textpos, text_len);
- do_munmap(current->mm, realdatastart, len);
+ vm_munmap(textpos, text_len);
+ vm_munmap(realdatastart, len);
ret = result;
goto err;
}
@@ -654,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm,
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read code+data+bss, errno %d\n",(int)-result);
- do_munmap(current->mm, textpos, text_len + data_len + extra +
+ vm_munmap(textpos, text_len + data_len + extra +
MAX_SHARED_LIBS * sizeof(unsigned long));
ret = result;
goto err;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 89b156d85d6..761e2cd8fed 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
if (ret > 0) {
/* we need an acl */
ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+ } else {
+ cache_no_acl(inode);
}
+ } else {
+ cache_no_acl(inode);
}
failed:
posix_acl_release(acl);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bcec0675023..3f75895c919 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -24,22 +24,135 @@
#include "delayed-ref.h"
#include "locking.h"
+struct extent_inode_elem {
+ u64 inum;
+ u64 offset;
+ struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+ struct btrfs_file_extent_item *fi,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 data_offset;
+ u64 data_len;
+ struct extent_inode_elem *e;
+
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ return 1;
+
+ e = kmalloc(sizeof(*e), GFP_NOFS);
+ if (!e)
+ return -ENOMEM;
+
+ e->next = *eie;
+ e->inum = key->objectid;
+ e->offset = key->offset + (extent_item_pos - data_offset);
+ *eie = e;
+
+ return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 disk_byte;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int slot;
+ int nritems;
+ int extent_type;
+ int ret;
+
+ /*
+ * from the shared data ref, we only have the leaf but we need
+ * the key. thus, we must look into all items and see that we
+ * find one (some) with a reference to our extent item.
+ */
+ nritems = btrfs_header_nritems(eb);
+ for (slot = 0; slot < nritems; ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte != wanted_disk_byte)
+ continue;
+
+ ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
/*
* this structure records all encountered refs on the way up to the root
*/
struct __prelim_ref {
struct list_head list;
u64 root_id;
- struct btrfs_key key;
+ struct btrfs_key key_for_search;
int level;
int count;
+ struct extent_inode_elem *inode_list;
u64 parent;
u64 wanted_disk_byte;
};
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ * block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | - | -
+ * key to resolve | - | y | y | y
+ * tree block logical | - | - | - | -
+ * root for resolving | y | y | y | y
+ *
+ * - column 1: we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | y | -
+ * key to resolve | - | - | - | y
+ * tree block logical | y | y | y | y
+ * root for resolving | - | y | y | y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2: we take the first key from the block to find the parent
+ * (see __add_missing_keys)
+ * - column 4: we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
static int __add_prelim_ref(struct list_head *head, u64 root_id,
- struct btrfs_key *key, int level, u64 parent,
- u64 wanted_disk_byte, int count)
+ struct btrfs_key *key, int level,
+ u64 parent, u64 wanted_disk_byte, int count)
{
struct __prelim_ref *ref;
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
ref->root_id = root_id;
if (key)
- ref->key = *key;
+ ref->key_for_search = *key;
else
- memset(&ref->key, 0, sizeof(ref->key));
+ memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+ ref->inode_list = NULL;
ref->level = level;
ref->count = count;
ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
}
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
- struct ulist *parents,
- struct extent_buffer *eb, int level,
- u64 wanted_objectid, u64 wanted_disk_byte)
+ struct ulist *parents, int level,
+ struct btrfs_key *key, u64 wanted_disk_byte,
+ const u64 *extent_item_pos)
{
int ret;
- int slot;
+ int slot = path->slots[level];
+ struct extent_buffer *eb = path->nodes[level];
struct btrfs_file_extent_item *fi;
- struct btrfs_key key;
+ struct extent_inode_elem *eie = NULL;
u64 disk_byte;
+ u64 wanted_objectid = key->objectid;
add_parent:
- ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (level == 0 && extent_item_pos) {
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
+ if (ret < 0)
+ return ret;
+ }
+ ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
if (ret < 0)
return ret;
@@ -89,6 +211,7 @@ add_parent:
* repeat this until we don't find any additional EXTENT_DATA items.
*/
while (1) {
+ eie = NULL;
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
@@ -97,9 +220,9 @@ add_parent:
eb = path->nodes[0];
for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.objectid != wanted_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY)
+ btrfs_item_key_to_cpu(eb, key, slot);
+ if (key->objectid != wanted_objectid ||
+ key->type != BTRFS_EXTENT_DATA_KEY)
return 0;
fi = btrfs_item_ptr(eb, slot,
struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
*/
static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int search_commit_root,
+ u64 time_seq,
struct __prelim_ref *ref,
- struct ulist *parents)
+ struct ulist *parents,
+ const u64 *extent_item_pos)
{
struct btrfs_path *path;
struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
path->lowest_level = level;
- ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
pr_debug("search slot in root %llu (level %d, ref count %d) returned "
"%d for key (%llu %u %llu)\n",
(unsigned long long)ref->root_id, level, ref->count, ret,
- (unsigned long long)ref->key.objectid, ref->key.type,
- (unsigned long long)ref->key.offset);
+ (unsigned long long)ref->key_for_search.objectid,
+ ref->key_for_search.type,
+ (unsigned long long)ref->key_for_search.offset);
if (ret < 0)
goto out;
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
}
- /* the last two parameters will only be used for level == 0 */
- ret = add_all_parents(root, path, parents, eb, level, key.objectid,
- ref->wanted_disk_byte);
+ ret = add_all_parents(root, path, parents, level, &key,
+ ref->wanted_disk_byte, extent_item_pos);
out:
btrfs_free_path(path);
return ret;
@@ -191,8 +316,9 @@ out:
* resolve all indirect backrefs from the list
*/
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- int search_commit_root,
- struct list_head *head)
+ int search_commit_root, u64 time_seq,
+ struct list_head *head,
+ const u64 *extent_item_pos)
{
int err;
int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct __prelim_ref *new_ref;
struct ulist *parents;
struct ulist_node *node;
+ struct ulist_iterator uiter;
parents = ulist_alloc(GFP_NOFS);
if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
if (ref->count == 0)
continue;
err = __resolve_indirect_ref(fs_info, search_commit_root,
- ref, parents);
+ time_seq, ref, parents,
+ extent_item_pos);
if (err) {
if (ret == 0)
ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
/* we put the first parent into the ref at hand */
- node = ulist_next(parents, NULL);
+ ULIST_ITER_INIT(&uiter);
+ node = ulist_next(parents, &uiter);
ref->parent = node ? node->val : 0;
+ ref->inode_list =
+ node ? (struct extent_inode_elem *)node->aux : 0;
/* additional parents require new refs being added here */
- while ((node = ulist_next(parents, node))) {
+ while ((node = ulist_next(parents, &uiter))) {
new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
if (!new_ref) {
ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
+ new_ref->inode_list =
+ (struct extent_inode_elem *)node->aux;
list_add(&new_ref->list, &ref->list);
}
ulist_reinit(parents);
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
return ret;
}
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+ struct __prelim_ref *ref2)
+{
+ if (ref1->level != ref2->level)
+ return 0;
+ if (ref1->root_id != ref2->root_id)
+ return 0;
+ if (ref1->key_for_search.type != ref2->key_for_search.type)
+ return 0;
+ if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+ return 0;
+ if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+ return 0;
+ if (ref1->parent != ref2->parent)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+ struct list_head *head)
+{
+ struct list_head *pos;
+ struct extent_buffer *eb;
+
+ list_for_each(pos, head) {
+ struct __prelim_ref *ref;
+ ref = list_entry(pos, struct __prelim_ref, list);
+
+ if (ref->parent)
+ continue;
+ if (ref->key_for_search.type)
+ continue;
+ BUG_ON(!ref->wanted_disk_byte);
+ eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+ fs_info->tree_root->leafsize, 0);
+ BUG_ON(!eb);
+ btrfs_tree_read_lock(eb);
+ if (btrfs_header_level(eb) == 0)
+ btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return 0;
+}
+
/*
* merge two lists of backrefs and adjust counts accordingly
*
* mode = 1: merge identical keys, if key is set
+ * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ * additionally, we could even add a key range for the blocks we
+ * looked into to merge even more (-> replace unresolved refs by those
+ * having a parent).
* mode = 2: merge identical parents
*/
static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
ref1 = list_entry(pos1, struct __prelim_ref, list);
- if (mode == 1 && ref1->key.type == 0)
- continue;
for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
pos2 = n2, n2 = pos2->next) {
struct __prelim_ref *ref2;
+ struct __prelim_ref *xchg;
ref2 = list_entry(pos2, struct __prelim_ref, list);
if (mode == 1) {
- if (memcmp(&ref1->key, &ref2->key,
- sizeof(ref1->key)) ||
- ref1->level != ref2->level ||
- ref1->root_id != ref2->root_id)
+ if (!ref_for_same_block(ref1, ref2))
continue;
+ if (!ref1->parent && ref2->parent) {
+ xchg = ref1;
+ ref1 = ref2;
+ ref2 = xchg;
+ }
ref1->count += ref2->count;
} else {
if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct btrfs_key *info_key,
struct list_head *prefs)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
+ struct btrfs_key key;
+ struct btrfs_key op_key = {0};
int sgn;
int ret = 0;
if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+ btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
while ((n = rb_prev(n))) {
struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_tree_ref *ref;
ref = btrfs_delayed_node_to_tree_ref(node);
- ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ret = __add_prelim_ref(prefs, ref->root, &op_key,
ref->level + 1, 0, node->bytenr,
node->ref_mod * sgn);
break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_tree_ref *ref;
ref = btrfs_delayed_node_to_tree_ref(node);
- ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ret = __add_prelim_ref(prefs, ref->root, NULL,
ref->level + 1, ref->parent,
node->bytenr,
node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
}
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_delayed_data_ref *ref;
- struct btrfs_key key;
-
ref = btrfs_delayed_node_to_data_ref(node);
key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
}
case BTRFS_SHARED_DATA_REF_KEY: {
struct btrfs_delayed_data_ref *ref;
- struct btrfs_key key;
ref = btrfs_delayed_node_to_data_ref(node);
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
*/
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- struct btrfs_key *info_key, int *info_level,
- struct list_head *prefs)
+ int *info_level, struct list_head *prefs)
{
int ret = 0;
int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
* enumerate all inline refs
*/
leaf = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
item_size = btrfs_item_size_nr(leaf, slot);
BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
- struct btrfs_disk_key disk_key;
info = (struct btrfs_tree_block_info *)ptr;
*info_level = btrfs_tree_block_level(leaf, info);
- btrfs_tree_block_key(leaf, info, &disk_key);
- btrfs_disk_key_to_cpu(info_key, &disk_key);
ptr += sizeof(struct btrfs_tree_block_info);
BUG_ON(ptr > end);
} else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
switch (type) {
case BTRFS_SHARED_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, 0, info_key,
+ ret = __add_prelim_ref(prefs, 0, NULL,
*info_level + 1, offset,
bytenr, 1);
break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, offset, info_key,
- *info_level + 1, 0, bytenr, 1);
+ ret = __add_prelim_ref(prefs, offset, NULL,
+ *info_level + 1, 0,
+ bytenr, 1);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
root = btrfs_extent_data_ref_root(leaf, dref);
- ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
- count);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count);
break;
}
default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
*/
static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- struct btrfs_key *info_key, int info_level,
- struct list_head *prefs)
+ int info_level, struct list_head *prefs)
{
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
switch (key.type) {
case BTRFS_SHARED_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, 0, info_key,
+ ret = __add_prelim_ref(prefs, 0, NULL,
info_level + 1, key.offset,
bytenr, 1);
break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, key.offset, info_key,
- info_level + 1, 0, bytenr, 1);
+ ret = __add_prelim_ref(prefs, key.offset, NULL,
+ info_level + 1, 0,
+ bytenr, 1);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
- bytenr, count);
+ bytenr, count);
break;
}
default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 seq, struct ulist *refs, struct ulist *roots)
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist *refs, struct ulist *roots,
+ const u64 *extent_item_pos)
{
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_key info_key = { 0 };
struct btrfs_delayed_ref_root *delayed_refs = NULL;
struct btrfs_delayed_ref_head *head;
int info_level = 0;
@@ -645,7 +830,7 @@ again:
btrfs_put_delayed_ref(&head->node);
goto again;
}
- ret = __add_delayed_refs(head, seq, &info_key,
+ ret = __add_delayed_refs(head, delayed_ref_seq,
&prefs_delayed);
if (ret) {
spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
struct extent_buffer *leaf;
int slot;
+ path->slots[0]--;
leaf = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
ret = __add_inline_refs(fs_info, path, bytenr,
- &info_key, &info_level, &prefs);
+ &info_level, &prefs);
if (ret)
goto out;
- ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+ ret = __add_keyed_refs(fs_info, path, bytenr,
info_level, &prefs);
if (ret)
goto out;
@@ -676,21 +862,18 @@ again:
}
btrfs_release_path(path);
- /*
- * when adding the delayed refs above, the info_key might not have
- * been known yet. Go over the list and replace the missing keys
- */
- list_for_each_entry(ref, &prefs_delayed, list) {
- if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
- memcpy(&ref->key, &info_key, sizeof(ref->key));
- }
list_splice_init(&prefs_delayed, &prefs);
+ ret = __add_missing_keys(fs_info, &prefs);
+ if (ret)
+ goto out;
+
ret = __merge_refs(&prefs, 1);
if (ret)
goto out;
- ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+ ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+ &prefs, extent_item_pos);
if (ret)
goto out;
@@ -709,7 +892,33 @@ again:
BUG_ON(ret < 0);
}
if (ref->count && ref->parent) {
- ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+ struct extent_inode_elem *eie = NULL;
+ if (extent_item_pos && !ref->inode_list) {
+ u32 bsz;
+ struct extent_buffer *eb;
+ bsz = btrfs_level_size(fs_info->extent_root,
+ info_level);
+ eb = read_tree_block(fs_info->extent_root,
+ ref->parent, bsz, 0);
+ BUG_ON(!eb);
+ ret = find_extent_in_eb(eb, bytenr,
+ *extent_item_pos, &eie);
+ ref->inode_list = eie;
+ free_extent_buffer(eb);
+ }
+ ret = ulist_add_merge(refs, ref->parent,
+ (unsigned long)ref->inode_list,
+ (unsigned long *)&eie, GFP_NOFS);
+ if (!ret && extent_item_pos) {
+ /*
+ * we've recorded that parent, so we must extend
+ * its inode list here
+ */
+ BUG_ON(!eie);
+ while (eie->next)
+ eie = eie->next;
+ eie->next = ref->inode_list;
+ }
BUG_ON(ret < 0);
}
kfree(ref);
@@ -734,6 +943,28 @@ out:
return ret;
}
+static void free_leaf_list(struct ulist *blocks)
+{
+ struct ulist_node *node = NULL;
+ struct extent_inode_elem *eie;
+ struct extent_inode_elem *eie_next;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(blocks, &uiter))) {
+ if (!node->aux)
+ continue;
+ eie = (struct extent_inode_elem *)node->aux;
+ for (; eie; eie = eie_next) {
+ eie_next = eie->next;
+ kfree(eie);
+ }
+ node->aux = 0;
+ }
+
+ ulist_free(blocks);
+}
+
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
*/
static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 seq, struct ulist **leafs)
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **leafs,
+ const u64 *extent_item_pos)
{
struct ulist *tmp;
int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+ ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ time_seq, *leafs, tmp, extent_item_pos);
ulist_free(tmp);
if (ret < 0 && ret != -ENOENT) {
- ulist_free(*leafs);
+ free_leaf_list(*leafs);
return ret;
}
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*/
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 seq, struct ulist **roots)
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
+ struct ulist_iterator uiter;
int ret;
tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
+ ULIST_ITER_INIT(&uiter);
while (1) {
- ret = find_parent_nodes(trans, fs_info, bytenr, seq,
- tmp, *roots);
+ ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ time_seq, tmp, *roots, NULL);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
return ret;
}
- node = ulist_next(tmp, node);
+ node = ulist_next(tmp, &uiter);
if (!node)
break;
bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 0;
}
-static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
- u64 orig_extent_item_objectid,
- u64 extent_item_pos, u64 root,
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+ u64 root, u64 extent_item_objectid,
iterate_extent_inodes_t *iterate, void *ctx)
{
- u64 disk_byte;
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *eb;
- int slot;
- int nritems;
+ struct extent_inode_elem *eie;
int ret = 0;
- int extent_type;
- u64 data_offset;
- u64 data_len;
-
- eb = read_tree_block(fs_info->tree_root, logical,
- fs_info->tree_root->leafsize, 0);
- if (!eb)
- return -EIO;
-
- /*
- * from the shared data ref, we only have the leaf but we need
- * the key. thus, we must look into all items and see that we
- * find one (some) with a reference to our extent item.
- */
- nritems = btrfs_header_nritems(eb);
- for (slot = 0; slot < nritems; ++slot) {
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(eb, fi);
- if (extent_type == BTRFS_FILE_EXTENT_INLINE)
- continue;
- /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte != orig_extent_item_objectid)
- continue;
-
- data_offset = btrfs_file_extent_offset(eb, fi);
- data_len = btrfs_file_extent_num_bytes(eb, fi);
-
- if (extent_item_pos < data_offset ||
- extent_item_pos >= data_offset + data_len)
- continue;
+ for (eie = inode_list; eie; eie = eie->next) {
pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
- "root %llu\n", orig_extent_item_objectid,
- key.objectid, key.offset, root);
- ret = iterate(key.objectid,
- key.offset + (extent_item_pos - data_offset),
- root, ctx);
+ "root %llu\n", extent_item_objectid,
+ eie->inum, eie->offset, root);
+ ret = iterate(eie->inum, eie->offset, root, ctx);
if (ret) {
- pr_debug("stopping iteration because ret=%d\n", ret);
+ pr_debug("stopping iteration for %llu due to ret=%d\n",
+ extent_item_objectid, ret);
break;
}
}
- free_extent_buffer(eb);
-
return ret;
}
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list seq_elem;
+ struct seq_list seq_elem = {};
+ struct seq_list tree_mod_seq_elem = {};
+ struct ulist_iterator ref_uiter;
+ struct ulist_iterator root_uiter;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs->lock);
btrfs_get_delayed_seq(delayed_refs, &seq_elem);
spin_unlock(&delayed_refs->lock);
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- extent_item_pos, seq_elem.seq,
- &refs);
-
+ seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+ &extent_item_pos);
if (ret)
goto out;
- while (!ret && (ref_node = ulist_next(refs, ref_node))) {
- ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
- seq_elem.seq, &roots);
+ ULIST_ITER_INIT(&ref_uiter);
+ while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+ ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ seq_elem.seq,
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
- while (!ret && (root_node = ulist_next(roots, root_node))) {
- pr_debug("root %llu references leaf %llu\n",
- root_node->val, ref_node->val);
- ret = iterate_leaf_refs(fs_info, ref_node->val,
- extent_item_objectid,
- extent_item_pos, root_node->val,
- iterate, ctx);
+ ULIST_ITER_INIT(&root_uiter);
+ while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+ pr_debug("root %llu references leaf %llu, data list "
+ "%#lx\n", root_node->val, ref_node->val,
+ ref_node->aux);
+ ret = iterate_leaf_refs(
+ (struct extent_inode_elem *)ref_node->aux,
+ root_node->val, extent_item_objectid,
+ iterate, ctx);
}
+ ulist_free(roots);
+ roots = NULL;
}
- ulist_free(refs);
+ free_leaf_list(refs);
ulist_free(roots);
out:
if (!search_commit_root) {
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_put_delayed_seq(delayed_refs, &seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 57ea2e959e4..c18d8ac7b79 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 seq, struct ulist **roots);
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **roots);
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15fd520..e616f8872e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,20 @@
#include "ordered-data.h"
#include "delayed-inode.h"
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
+#define BTRFS_INODE_ORPHAN_META_RESERVED 1
+#define BTRFS_INODE_DUMMY 2
+#define BTRFS_INODE_IN_DEFRAG 3
+#define BTRFS_INODE_DELALLOC_META_RESERVED 4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
+
/* in memory btrfs inode */
struct btrfs_inode {
/* which subvolume this inode belongs to */
@@ -57,9 +71,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
- /* for keeping track of orphaned inodes */
- struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
* to walk them all.
@@ -78,14 +89,13 @@ struct btrfs_inode {
/* the space_info for where this inode's data allocations are done */
struct btrfs_space_info *space_info;
+ unsigned long runtime_flags;
+
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
u64 generation;
- /* sequence number for NFS changes */
- u64 sequence;
-
/*
* transid of the trans_handle that last modified this inode
*/
@@ -145,22 +155,9 @@ struct btrfs_inode {
unsigned reserved_extents;
/*
- * ordered_data_close is set by truncate when a file that used
- * to have good data has been truncated to zero. When it is set
- * the btrfs file release call will add this inode to the
- * ordered operations list so that we make sure to flush out any
- * new data the application may have written before commit.
- */
- unsigned ordered_data_close:1;
- unsigned orphan_meta_reserved:1;
- unsigned dummy_inode:1;
- unsigned in_defrag:1;
- unsigned delalloc_meta_reserved:1;
-
- /*
* always compress this one file
*/
- unsigned force_compress:4;
+ unsigned force_compress;
struct btrfs_delayed_node *delayed_node;
@@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
return false;
}
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == generation &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index c053e90f200..9cebb1fd6a3 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -103,8 +103,6 @@
#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
* excluding " [...]" */
-#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
-
#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
/*
@@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx {
u64 dev_bytenr; /* physical bytenr on device */
u32 len;
struct btrfsic_dev_state *dev;
- char *data;
- struct buffer_head *bh; /* do not use if set to NULL */
+ char **datav;
+ struct page **pagev;
+ void *mem_to_free;
};
/* This structure is used to implement recursion without occupying
@@ -243,6 +242,8 @@ struct btrfsic_state {
struct btrfs_root *root;
u64 max_superblock_generation;
struct btrfsic_block *latest_superblock;
+ u32 metablock_size;
+ u32 datablock_size;
};
static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
static int btrfsic_process_metablock(struct btrfsic_state *state,
struct btrfsic_block *block,
struct btrfsic_block_data_ctx *block_ctx,
- struct btrfs_header *hdr,
int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dst, u32 offset, size_t len);
static int btrfsic_create_link_to_next_block(
struct btrfsic_state *state,
struct btrfsic_block *block,
@@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
static void btrfsic_dump_database(struct btrfsic_state *state);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- const u8 *data, unsigned int size);
+ char **datav, unsigned int num_pages);
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, u8 *mapped_data,
- unsigned int len, struct bio *bio,
- int *bio_is_patched,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
struct buffer_head *bh,
int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
@@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
u64 bytenr,
struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char *data);
+ u64 dev_bytenr);
static struct mutex btrfsic_mutex;
static int btrfsic_is_initialized;
@@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int pass;
BUG_ON(NULL == state);
- selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+ selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
return -1;
@@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
struct btrfsic_block *next_block;
struct btrfsic_block_data_ctx tmp_next_block_ctx;
struct btrfsic_block_link *l;
- struct btrfs_header *hdr;
- ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
&tmp_next_block_ctx,
mirror_num);
if (ret) {
@@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
BUG_ON(NULL == l);
ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ if (ret < (int)PAGE_CACHE_SIZE) {
printk(KERN_INFO
"btrfsic: read @logical %llu failed!\n",
(unsigned long long)
@@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
return -1;
}
- hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
ret = btrfsic_process_metablock(state,
next_block,
&tmp_next_block_ctx,
- hdr,
BTRFS_MAX_LEVEL + 3, 1);
btrfsic_release_block_ctx(&tmp_next_block_ctx);
}
@@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror(
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ return -1;
+ bh = __bread(superblock_bdev, dev_bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
if (NULL == bh)
return -1;
super_tmp = (struct btrfs_super_block *)
@@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror(
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
sizeof(super_tmp->magic)) ||
- memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+ btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+ btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+ btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
brelse(bh);
return 0;
}
@@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror(
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror(
struct btrfsic_block_data_ctx tmp_next_block_ctx;
struct btrfsic_block_link *l;
- if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ if (btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
&tmp_next_block_ctx,
mirror_num)) {
printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +975,15 @@ static int btrfsic_process_metablock(
struct btrfsic_state *state,
struct btrfsic_block *const first_block,
struct btrfsic_block_data_ctx *const first_block_ctx,
- struct btrfs_header *const first_hdr,
int first_limit_nesting, int force_iodone_flag)
{
struct btrfsic_stack_frame initial_stack_frame = { 0 };
struct btrfsic_stack_frame *sf;
struct btrfsic_stack_frame *next_stack;
+ struct btrfs_header *const first_hdr =
+ (struct btrfs_header *)first_block_ctx->datav[0];
+ BUG_ON(!first_hdr);
sf = &initial_stack_frame;
sf->error = 0;
sf->i = -1;
@@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame:
}
if (sf->i < sf->nr) {
- struct btrfs_item *disk_item = leafhdr->items + sf->i;
- struct btrfs_disk_key *disk_key = &disk_item->key;
+ struct btrfs_item disk_item;
+ u32 disk_item_offset =
+ (uintptr_t)(leafhdr->items + sf->i) -
+ (uintptr_t)leafhdr;
+ struct btrfs_disk_key *disk_key;
u8 type;
- const u32 item_offset = le32_to_cpu(disk_item->offset);
+ u32 item_offset;
+ if (disk_item_offset + sizeof(struct btrfs_item) >
+ sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+ printk(KERN_INFO
+ "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(sf->block_ctx,
+ &disk_item,
+ disk_item_offset,
+ sizeof(struct btrfs_item));
+ item_offset = le32_to_cpu(disk_item.offset);
+ disk_key = &disk_item.key;
type = disk_key->type;
if (BTRFS_ROOT_ITEM_KEY == type) {
- const struct btrfs_root_item *const root_item =
- (struct btrfs_root_item *)
- (sf->block_ctx->data +
- offsetof(struct btrfs_leaf, items) +
- item_offset);
- const u64 next_bytenr =
- le64_to_cpu(root_item->bytenr);
+ struct btrfs_root_item root_item;
+ u32 root_item_offset;
+ u64 next_bytenr;
+
+ root_item_offset = item_offset +
+ offsetof(struct btrfs_leaf, items);
+ if (root_item_offset +
+ sizeof(struct btrfs_root_item) >
+ sf->block_ctx->len)
+ goto leaf_item_out_of_bounce_error;
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &root_item,
+ root_item_offset,
+ sizeof(struct btrfs_root_item));
+ next_bytenr = le64_to_cpu(root_item.bytenr);
sf->error =
btrfsic_create_link_to_next_block(
@@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame:
&sf->num_copies,
&sf->mirror_num,
disk_key,
- le64_to_cpu(root_item->
+ le64_to_cpu(root_item.
generation));
if (sf->error)
goto one_stack_frame_backwards;
@@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame:
if (NULL != sf->next_block) {
struct btrfs_header *const next_hdr =
(struct btrfs_header *)
- sf->next_block_ctx.data;
+ sf->next_block_ctx.datav[0];
next_stack =
btrfsic_stack_frame_alloc();
@@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame:
}
if (sf->i < sf->nr) {
- struct btrfs_key_ptr *disk_key_ptr =
- nodehdr->ptrs + sf->i;
- const u64 next_bytenr =
- le64_to_cpu(disk_key_ptr->blockptr);
+ struct btrfs_key_ptr key_ptr;
+ u32 key_ptr_offset;
+ u64 next_bytenr;
+
+ key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+ (uintptr_t)nodehdr;
+ if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+ sf->block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &key_ptr, key_ptr_offset,
+ sizeof(struct btrfs_key_ptr));
+ next_bytenr = le64_to_cpu(key_ptr.blockptr);
sf->error = btrfsic_create_link_to_next_block(
state,
@@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame:
force_iodone_flag,
&sf->num_copies,
&sf->mirror_num,
- &disk_key_ptr->key,
- le64_to_cpu(disk_key_ptr->generation));
+ &key_ptr.key,
+ le64_to_cpu(key_ptr.generation));
if (sf->error)
goto one_stack_frame_backwards;
if (NULL != sf->next_block) {
struct btrfs_header *const next_hdr =
(struct btrfs_header *)
- sf->next_block_ctx.data;
+ sf->next_block_ctx.datav[0];
next_stack = btrfsic_stack_frame_alloc();
if (NULL == next_stack)
@@ -1181,6 +1232,35 @@ one_stack_frame_backwards:
return sf->error;
}
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dstv, u32 offset, size_t len)
+{
+ size_t cur;
+ size_t offset_in_page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(offset + len > block_ctx->len);
+ offset_in_page = (start_offset + offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
+ BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT);
+ kaddr = block_ctx->datav[i];
+ memcpy(dst, kaddr + offset_in_page, cur);
+
+ dst += cur;
+ len -= cur;
+ offset_in_page = 0;
+ i++;
+ }
+}
+
static int btrfsic_create_link_to_next_block(
struct btrfsic_state *state,
struct btrfsic_block *block,
@@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block(
if (0 == *num_copiesp) {
*num_copiesp =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block(
"btrfsic_create_link_to_next_block(mirror_num=%d)\n",
*mirror_nump);
ret = btrfsic_map_block(state, next_bytenr,
- BTRFSIC_BLOCK_SIZE,
+ state->metablock_size,
next_block_ctx, *mirror_nump);
if (ret) {
printk(KERN_INFO
@@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block(
if (limit_nesting > 0 && did_alloc_block_link) {
ret = btrfsic_read_block(state, next_block_ctx);
- if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ if (ret < (int)next_block_ctx->len) {
printk(KERN_INFO
"btrfsic: read block @logical %llu failed!\n",
(unsigned long long)next_bytenr);
@@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data(
u32 item_offset, int force_iodone_flag)
{
int ret;
- struct btrfs_file_extent_item *file_extent_item =
- (struct btrfs_file_extent_item *)(block_ctx->data +
- offsetof(struct btrfs_leaf,
- items) + item_offset);
- u64 next_bytenr =
- le64_to_cpu(file_extent_item->disk_bytenr) +
- le64_to_cpu(file_extent_item->offset);
- u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
- u64 generation = le64_to_cpu(file_extent_item->generation);
+ struct btrfs_file_extent_item file_extent_item;
+ u64 file_extent_item_offset;
+ u64 next_bytenr;
+ u64 num_bytes;
+ u64 generation;
struct btrfsic_block_link *l;
+ file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+ item_offset;
+ if (file_extent_item_offset +
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+ ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+ file_extent_item.type,
+ (unsigned long long)
+ le64_to_cpu(file_extent_item.disk_bytenr));
+ return 0;
+ }
+
+ if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ sizeof(struct btrfs_file_extent_item));
+ next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
+ le64_to_cpu(file_extent_item.offset);
+ generation = le64_to_cpu(file_extent_item.generation);
+ num_bytes = le64_to_cpu(file_extent_item.num_bytes);
+ generation = le64_to_cpu(file_extent_item.generation);
+
if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
" offset = %llu, num_bytes = %llu\n",
- file_extent_item->type,
+ file_extent_item.type,
(unsigned long long)
- le64_to_cpu(file_extent_item->disk_bytenr),
- (unsigned long long)
- le64_to_cpu(file_extent_item->offset),
- (unsigned long long)
- le64_to_cpu(file_extent_item->num_bytes));
- if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
- ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
- return 0;
+ le64_to_cpu(file_extent_item.disk_bytenr),
+ (unsigned long long)le64_to_cpu(file_extent_item.offset),
+ (unsigned long long)num_bytes);
while (num_bytes > 0) {
u32 chunk_len;
int num_copies;
int mirror_num;
- if (num_bytes > BTRFSIC_BLOCK_SIZE)
- chunk_len = BTRFSIC_BLOCK_SIZE;
+ if (num_bytes > state->datablock_size)
+ chunk_len = state->datablock_size;
else
chunk_len = num_bytes;
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->datablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
block_ctx_out->dev_bytenr = multi->stripes[0].physical;
block_ctx_out->start = bytenr;
block_ctx_out->len = len;
- block_ctx_out->data = NULL;
- block_ctx_out->bh = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
if (0 == ret)
kfree(multi);
@@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
block_ctx_out->dev_bytenr = bytenr;
block_ctx_out->start = bytenr;
block_ctx_out->len = len;
- block_ctx_out->data = NULL;
- block_ctx_out->bh = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
if (NULL != block_ctx_out->dev) {
return 0;
} else {
@@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
{
- if (NULL != block_ctx->bh) {
- brelse(block_ctx->bh);
- block_ctx->bh = NULL;
+ if (block_ctx->mem_to_free) {
+ unsigned int num_pages;
+
+ BUG_ON(!block_ctx->datav);
+ BUG_ON(!block_ctx->pagev);
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ while (num_pages > 0) {
+ num_pages--;
+ if (block_ctx->datav[num_pages]) {
+ kunmap(block_ctx->pagev[num_pages]);
+ block_ctx->datav[num_pages] = NULL;
+ }
+ if (block_ctx->pagev[num_pages]) {
+ __free_page(block_ctx->pagev[num_pages]);
+ block_ctx->pagev[num_pages] = NULL;
+ }
+ }
+
+ kfree(block_ctx->mem_to_free);
+ block_ctx->mem_to_free = NULL;
+ block_ctx->pagev = NULL;
+ block_ctx->datav = NULL;
}
}
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx)
{
- block_ctx->bh = NULL;
- if (block_ctx->dev_bytenr & 4095) {
+ unsigned int num_pages;
+ unsigned int i;
+ u64 dev_bytenr;
+ int ret;
+
+ BUG_ON(block_ctx->datav);
+ BUG_ON(block_ctx->pagev);
+ BUG_ON(block_ctx->mem_to_free);
+ if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: read_block() with unaligned bytenr %llu\n",
(unsigned long long)block_ctx->dev_bytenr);
return -1;
}
- if (block_ctx->len > 4096) {
- printk(KERN_INFO
- "btrfsic: read_block() with too huge size %d\n",
- block_ctx->len);
+
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+ sizeof(*block_ctx->pagev)) *
+ num_pages, GFP_NOFS);
+ if (!block_ctx->mem_to_free)
return -1;
+ block_ctx->datav = block_ctx->mem_to_free;
+ block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+ if (!block_ctx->pagev[i])
+ return -1;
}
- block_ctx->bh = __bread(block_ctx->dev->bdev,
- block_ctx->dev_bytenr >> 12, 4096);
- if (NULL == block_ctx->bh)
- return -1;
- block_ctx->data = block_ctx->bh->b_data;
+ dev_bytenr = block_ctx->dev_bytenr;
+ for (i = 0; i < num_pages;) {
+ struct bio *bio;
+ unsigned int j;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ bio = bio_alloc(GFP_NOFS, num_pages - i);
+ if (!bio) {
+ printk(KERN_INFO
+ "btrfsic: bio_alloc() for %u pages failed!\n",
+ num_pages - i);
+ return -1;
+ }
+ bio->bi_bdev = block_ctx->dev->bdev;
+ bio->bi_sector = dev_bytenr >> 9;
+ bio->bi_end_io = btrfsic_complete_bio_end_io;
+ bio->bi_private = &complete;
+
+ for (j = i; j < num_pages; j++) {
+ ret = bio_add_page(bio, block_ctx->pagev[j],
+ PAGE_CACHE_SIZE, 0);
+ if (PAGE_CACHE_SIZE != ret)
+ break;
+ }
+ if (j == i) {
+ printk(KERN_INFO
+ "btrfsic: error, failed to add a single page!\n");
+ return -1;
+ }
+ submit_bio(READ, bio);
+
+ /* this will also unplug the queue */
+ wait_for_completion(&complete);
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ printk(KERN_INFO
+ "btrfsic: read error at logical %llu dev %s!\n",
+ block_ctx->start, block_ctx->dev->name);
+ bio_put(bio);
+ return -1;
+ }
+ bio_put(bio);
+ dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+ i = j;
+ }
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ if (!block_ctx->datav[i]) {
+ printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+ block_ctx->dev->name);
+ return -1;
+ }
+ }
return block_ctx->len;
}
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
static void btrfsic_dump_database(struct btrfsic_state *state)
{
struct list_head *elem_all;
@@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
* (note that this test fails for the super block)
*/
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- const u8 *data, unsigned int size)
+ char **datav, unsigned int num_pages)
{
struct btrfs_header *h;
u8 csum[BTRFS_CSUM_SIZE];
u32 crc = ~(u32)0;
- int fail = 0;
- int crc_fail = 0;
+ unsigned int i;
- h = (struct btrfs_header *)data;
+ if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+ return 1; /* not metadata */
+ num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+ h = (struct btrfs_header *)datav[0];
if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
- fail++;
+ return 1;
+
+ for (i = 0; i < num_pages; i++) {
+ u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_CACHE_SIZE :
+ (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
- crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ crc = crc32c(crc, data, sublen);
+ }
btrfs_csum_final(crc, csum);
if (memcmp(csum, h->csum, state->csum_size))
- crc_fail++;
+ return 1;
- return fail || crc_fail;
+ return 0; /* is metadata */
}
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr,
- u8 *mapped_data, unsigned int len,
- struct bio *bio,
- int *bio_is_patched,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
struct buffer_head *bh,
int submit_bio_bh_rw)
{
@@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
int ret;
struct btrfsic_state *state = dev_state->state;
struct block_device *bdev = dev_state->bdev;
+ unsigned int processed_len;
- WARN_ON(len > PAGE_SIZE);
- is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
if (NULL != bio_is_patched)
*bio_is_patched = 0;
+again:
+ if (num_pages == 0)
+ return;
+
+ processed_len = 0;
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+ num_pages));
+
block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
&state->block_hashtable);
if (NULL != block) {
@@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
if (block->is_superblock) {
bytenr = le64_to_cpu(((struct btrfs_super_block *)
- mapped_data)->bytenr);
+ mapped_datav[0])->bytenr);
+ if (num_pages * PAGE_CACHE_SIZE <
+ BTRFS_SUPER_INFO_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
is_metadata = 1;
+ BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+ processed_len = BTRFS_SUPER_INFO_SIZE;
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
printk(KERN_INFO
@@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
}
if (is_metadata) {
if (!block->is_superblock) {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->metablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->metablock_size;
bytenr = le64_to_cpu(((struct btrfs_header *)
- mapped_data)->bytenr);
+ mapped_datav[0])->bytenr);
btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
dev_state,
- dev_bytenr,
- mapped_data);
+ dev_bytenr);
}
if (block->logical_bytenr != bytenr) {
printk(KERN_INFO
@@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
block->mirror_num,
btrfsic_get_block_type(state, block));
} else {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->datablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
@@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
le64_to_cpu(block->disk_key.offset),
(unsigned long long)
le64_to_cpu(((struct btrfs_header *)
- mapped_data)->generation),
+ mapped_datav[0])->generation),
(unsigned long long)
state->max_superblock_generation);
btrfsic_dump_tree(state);
@@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
(unsigned long long)block->generation,
(unsigned long long)
le64_to_cpu(((struct btrfs_header *)
- mapped_data)->generation));
+ mapped_datav[0])->generation));
/* it would not be safe to go on */
btrfsic_dump_tree(state);
- return;
+ goto continue_loop;
}
/*
@@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
}
if (block->is_superblock)
- ret = btrfsic_map_superblock(state, bytenr, len,
+ ret = btrfsic_map_superblock(state, bytenr,
+ processed_len,
bdev, &block_ctx);
else
- ret = btrfsic_map_block(state, bytenr, len,
+ ret = btrfsic_map_block(state, bytenr, processed_len,
&block_ctx, 0);
if (ret) {
printk(KERN_INFO
"btrfsic: btrfsic_map_block(root @%llu)"
" failed!\n", (unsigned long long)bytenr);
- return;
+ goto continue_loop;
}
- block_ctx.data = mapped_data;
+ block_ctx.datav = mapped_datav;
/* the following is required in case of writes to mirrors,
* use the same that was used for the lookup */
block_ctx.dev = dev_state;
@@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
block->logical_bytenr = bytenr;
block->is_metadata = 1;
if (block->is_superblock) {
+ BUG_ON(PAGE_CACHE_SIZE !=
+ BTRFS_SUPER_INFO_SIZE);
ret = btrfsic_process_written_superblock(
state,
block,
(struct btrfs_super_block *)
- mapped_data);
+ mapped_datav[0]);
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
printk(KERN_INFO
@@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
state,
block,
&block_ctx,
- (struct btrfs_header *)
- block_ctx.data,
0, 0);
}
if (ret)
@@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 bytenr;
if (!is_metadata) {
+ processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO "Written block (%s/%llu/?)"
" !found in hash table, D.\n",
dev_state->name,
(unsigned long long)dev_bytenr);
- if (!state->include_extent_data)
- return; /* ignore that written D block */
+ if (!state->include_extent_data) {
+ /* ignore that written D block */
+ goto continue_loop;
+ }
/* this is getting ugly for the
* include_extent_data case... */
bytenr = 0; /* unknown */
block_ctx.start = bytenr;
- block_ctx.len = len;
- block_ctx.bh = NULL;
+ block_ctx.len = processed_len;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.pagev = NULL;
} else {
+ processed_len = state->metablock_size;
bytenr = le64_to_cpu(((struct btrfs_header *)
- mapped_data)->bytenr);
+ mapped_datav[0])->bytenr);
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
- dev_bytenr,
- mapped_data);
+ dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
dev_state->name,
(unsigned long long)dev_bytenr);
- ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
- 0);
+ ret = btrfsic_map_block(state, bytenr, processed_len,
+ &block_ctx, 0);
if (ret) {
printk(KERN_INFO
"btrfsic: btrfsic_map_block(root @%llu)"
" failed!\n",
(unsigned long long)dev_bytenr);
- return;
+ goto continue_loop;
}
}
- block_ctx.data = mapped_data;
+ block_ctx.datav = mapped_datav;
/* the following is required in case of writes to mirrors,
* use the same that was used for the lookup */
block_ctx.dev = dev_state;
@@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
if (NULL == block) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&block_ctx);
- return;
+ goto continue_loop;
}
block->dev_state = dev_state;
block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
if (is_metadata) {
ret = btrfsic_process_metablock(state, block,
- &block_ctx,
- (struct btrfs_header *)
- block_ctx.data, 0, 0);
+ &block_ctx, 0, 0);
if (ret)
printk(KERN_INFO
"btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
}
btrfsic_release_block_ctx(&block_ctx);
}
+
+continue_loop:
+ BUG_ON(!processed_len);
+ dev_bytenr += processed_len;
+ mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+ num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+ goto again;
}
static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock(
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, BTRFS_SUPER_INFO_SIZE);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock(
printk(KERN_INFO
"btrfsic_process_written_superblock("
"mirror_num=%d)\n", mirror_num);
- ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE,
&tmp_next_block_ctx,
mirror_num);
if (ret) {
@@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
u64 bytenr,
struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char *data)
+ u64 dev_bytenr)
{
int num_copies;
int mirror_num;
@@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
int match = 0;
num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
- bytenr, PAGE_SIZE);
+ bytenr, state->metablock_size);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, bytenr, state->metablock_size,
&block_ctx, mirror_num);
if (ret) {
printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
(unsigned long long)bytenr, dev_state->name,
(unsigned long long)dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, bytenr,
+ state->metablock_size,
&block_ctx, mirror_num);
if (ret)
continue;
@@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
(unsigned long)bh->b_size, bh->b_data,
bh->b_bdev);
btrfsic_process_written_block(dev_state, dev_bytenr,
- bh->b_data, bh->b_size, NULL,
+ &bh->b_data, 1, NULL,
NULL, bh, rw);
} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+ "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
rw, bh->b_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
@@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
unsigned int i;
u64 dev_bytenr;
int bio_is_patched;
+ char **mapped_datav;
dev_bytenr = 512 * bio->bi_sector;
bio_is_patched = 0;
@@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
(unsigned long long)dev_bytenr,
bio->bi_bdev);
+ mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
+ GFP_NOFS);
+ if (!mapped_datav)
+ goto leave;
for (i = 0; i < bio->bi_vcnt; i++) {
- u8 *mapped_data;
-
- mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+ BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+ mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+ if (!mapped_datav[i]) {
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ goto leave;
+ }
if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE) ==
(dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
printk(KERN_INFO
- "#%u: page=%p, mapped=%p, len=%u,"
- " offset=%u\n",
+ "#%u: page=%p, len=%u, offset=%u\n",
i, bio->bi_io_vec[i].bv_page,
- mapped_data,
bio->bi_io_vec[i].bv_len,
bio->bi_io_vec[i].bv_offset);
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_data,
- bio->bi_io_vec[i].bv_len,
- bio, &bio_is_patched,
- NULL, rw);
+ }
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ mapped_datav, bio->bi_vcnt,
+ bio, &bio_is_patched,
+ NULL, rw);
+ while (i > 0) {
+ i--;
kunmap(bio->bi_io_vec[i].bv_page);
- dev_bytenr += bio->bi_io_vec[i].bv_len;
}
+ kfree(mapped_datav);
} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+ "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
rw, bio->bi_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
@@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
bio->bi_end_io = btrfsic_bio_end_io;
}
}
+leave:
mutex_unlock(&btrfsic_mutex);
submit_bio(rw, bio);
@@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
+ if (root->nodesize != root->leafsize) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
+ root->nodesize, root->leafsize);
+ return -1;
+ }
+ if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
state = kzalloc(sizeof(*state), GFP_NOFS);
if (NULL == state) {
printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
@@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root,
state->print_mask = print_mask;
state->include_extent_data = including_extent_data;
state->csum_size = 0;
+ state->metablock_size = root->nodesize;
+ state->datablock_size = root->sectorsize;
INIT_LIST_HEAD(&state->all_blocks_list);
btrfsic_block_hashtable_init(&state->block_hashtable);
btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root,
btrfsic_block_link_free(l);
}
- if (b_all->is_iodone)
+ if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
printk(KERN_INFO "btrfs: attempt to free %c-block"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4106264fbc6..d7a96cfdc50 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot);
+ struct btrfs_path *path, int level, int slot,
+ int tree_mod_log);
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb);
+struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid,
+ u64 time_seq);
+struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ u64 time_seq);
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
new_root_objectid, &disk_key, level,
- buf->start, 0, 1);
+ buf->start, 0);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return 0;
}
+enum mod_log_op {
+ MOD_LOG_KEY_REPLACE,
+ MOD_LOG_KEY_ADD,
+ MOD_LOG_KEY_REMOVE,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ MOD_LOG_MOVE_KEYS,
+ MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+ int dst_slot;
+ int nr_items;
+};
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 index; /* shifted logical */
+ struct seq_list elem;
+ enum mod_log_op op;
+
+ /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+ int slot;
+
+ /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+ u64 generation;
+
+ /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* this is used for op == MOD_LOG_MOVE_KEYS */
+ struct tree_mod_move move;
+
+ /* this is used for op == MOD_LOG_ROOT_REPLACE */
+ struct tree_mod_root old_root;
+};
+
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+ elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+}
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ elem->flags = 1;
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ __get_tree_mod_seq(fs_info, elem);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct seq_list *cur_elem;
+ struct tree_mod_elem *tm;
+ u64 min_seq = (u64)-1;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ BUG_ON(!(elem->flags & 1));
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ list_del(&elem->list);
+
+ list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+ if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+ if (seq_putting > cur_elem->seq) {
+ /*
+ * blocker with lower sequence number exists, we
+ * cannot remove anything from the log
+ */
+ goto out;
+ }
+ min_seq = cur_elem->seq;
+ }
+ }
+
+ /*
+ * anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ write_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = container_of(node, struct tree_mod_elem, node);
+ if (tm->elem.seq > min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ list_del(&tm->elem.list);
+ kfree(tm);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+out:
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * key order of the log:
+ * index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+ int ret = 0;
+
+ BUG_ON(!tm || !tm->elem.seq);
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = container_of(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->index < tm->index)
+ new = &((*new)->rb_left);
+ else if (cur->index > tm->index)
+ new = &((*new)->rb_right);
+ else if (cur->elem.seq < tm->elem.seq)
+ new = &((*new)->rb_left);
+ else if (cur->elem.seq > tm->elem.seq)
+ new = &((*new)->rb_right);
+ else {
+ kfree(tm);
+ ret = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+unlock:
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return ret;
+}
+
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb) {
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 1;
+ if (!eb)
+ return 0;
+ if (btrfs_header_level(eb) == 0)
+ return 1;
+ return 0;
+}
+
+static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+ struct tree_mod_elem **tm_ret)
+{
+ struct tree_mod_elem *tm;
+ int seq;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ return 0;
+
+ tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return -ENOMEM;
+
+ tm->elem.flags = 0;
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ /*
+ * someone emptied the list while we were waiting for the lock.
+ * we must not add to the list, because no blocker exists. items
+ * are removed from the list only when the existing blocker is
+ * removed from the list.
+ */
+ kfree(tm);
+ seq = 0;
+ } else {
+ __get_tree_mod_seq(fs_info, &tm->elem);
+ seq = tm->elem.seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ return seq;
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ if (op != MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+
+ return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ int slot, enum mod_log_op op)
+{
+ return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int dst_slot, int src_slot,
+ int nr_items, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+ int i;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return 0;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING);
+ BUG_ON(ret < 0);
+ }
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = MOD_LOG_MOVE_KEYS;
+
+ return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *old_root,
+ struct extent_buffer *new_root, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = MOD_LOG_ROOT_REPLACE;
+
+ return __tree_mod_log_insert(fs_info, tm);
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+ int smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+ u64 index = start >> PAGE_CACHE_SHIFT;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = container_of(node, struct tree_mod_elem, node);
+ if (cur->index < index) {
+ node = node->rb_left;
+ } else if (cur->index > index) {
+ node = node->rb_right;
+ } else if (cur->elem.seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* we want the node with the highest seq */
+ if (found)
+ BUG_ON(found->elem.seq > cur->elem.seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->elem.seq > min_seq) {
+ /* we want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->elem.seq < cur->elem.seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+ u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static inline void
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ struct extent_buffer *src, unsigned long dst_offset,
+ unsigned long src_offset, int nr_items)
+{
+ int ret;
+ int i;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ return;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return;
+
+ /* speed this up by single seq for all operations? */
+ for (i = 0; i < nr_items; i++) {
+ ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
+ ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
+ MOD_LOG_KEY_ADD);
+ BUG_ON(ret < 0);
+ }
+}
+
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ int dst_offset, int src_offset, int nr_items)
+{
+ int ret;
+ ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+ nr_items, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static inline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int slot, int atomic)
+{
+ int ret;
+
+ ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
+ MOD_LOG_KEY_REPLACE,
+ atomic ? GFP_ATOMIC : GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ int i;
+ int ret;
+ u32 nritems;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return;
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert_key(fs_info, eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+ BUG_ON(ret < 0);
+ }
+}
+
+static inline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+ struct extent_buffer *new_root_node)
+{
+ int ret;
+ tree_mod_log_free_eb(root->fs_info, root->node);
+ ret = tree_mod_log_insert_root(root->fs_info, root->node,
+ new_root_node, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
/*
* check if the tree block can be shared by multiple trees
*/
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, buf, 1, 1);
BUG_ON(ret); /* -ENOMEM */
}
+ /*
+ * don't log freeing in case we're freeing the root node, this
+ * is done by tree_mod_log_set_root_pointer later
+ */
+ if (buf != root->node && btrfs_header_level(buf) != 0)
+ tree_mod_log_free_eb(root->fs_info, buf);
clean_tree_block(trans, root, buf);
*last_ref = 1;
}
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
root->root_key.objectid, &disk_key,
- level, search_start, empty_size, 1);
+ level, search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
extent_buffer_get(cow);
+ tree_mod_log_set_root_pointer(root, cow);
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref, 1);
+ last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
WARN_ON(trans->transid != btrfs_header_generation(parent));
+ tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+ MOD_LOG_KEY_REPLACE);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref, 1);
+ last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * returns the logical address of the oldest predecessor of the given root.
+ * entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = root->node->start;
+ int looped = 0;
+
+ if (!time_seq)
+ return 0;
+
+ /*
+ * the very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). this has the index of the *new*
+ * root, making it the very first operation that's logged for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return 0;
+ /*
+ * we must have key remove operations in the log before the
+ * replace operation.
+ */
+ BUG_ON(!tm);
+
+ if (tm->op != MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ BUG_ON(root_logical == root->node->start);
+ looped = 1;
+ }
+
+ return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewinded (until we reach something older than
+ * time_seq).
+ */
+static void
+__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
+ struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ while (tm && tm->elem.seq >= time_seq) {
+ /*
+ * all the operations are recorded with the operator used for
+ * the modification. as we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case MOD_LOG_KEY_ADD:
+ if (tm->slot != n - 1) {
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
+ memmove_extent_buffer(eb, o_dst, o_src, p_size);
+ }
+ n--;
+ break;
+ case MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case MOD_LOG_ROOT_REPLACE:
+ /*
+ * this operation is special. for roots, this must be
+ * handled explicitly before rewinding.
+ * for non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. in the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. we simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = container_of(next, struct tree_mod_elem, node);
+ if (tm->index != first_tm->index)
+ break;
+ }
+ btrfs_set_header_nritems(eb, n);
+}
+
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(eb->start,
+ fs_info->tree_root->nodesize);
+ BUG_ON(!eb_rewin);
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ BUG_ON(!eb_rewin);
+ }
+
+ extent_buffer_get(eb_rewin);
+ free_extent_buffer(eb);
+
+ __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+
+ return eb_rewin;
+}
+
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb;
+ struct tree_mod_root *old_root;
+ u64 old_generation;
+
+ tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+ if (!tm)
+ return root->node;
+
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+
+ tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
+ /*
+ * there was an item in the log when __tree_mod_log_oldest_root
+ * returned. this one must not go away, because the time_seq passed to
+ * us must be blocking its removal.
+ */
+ BUG_ON(!tm);
+
+ if (old_root->logical == root->node->start) {
+ /* there are logged operations for the current root */
+ eb = btrfs_clone_extent_buffer(root->node);
+ } else {
+ /* there's a root replace operation for the current root */
+ eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
+ root->nodesize);
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, root->root_key.objectid);
+ }
+ if (!eb)
+ return NULL;
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ __tree_mod_log_rewind(eb, time_seq, tm);
+
+ return eb;
+}
+
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur)
return -EIO;
} else if (!uptodate) {
- btrfs_read_buffer(cur, gen);
+ err = btrfs_read_buffer(cur, gen);
+ if (err) {
+ free_extent_buffer(cur);
+ return err;
+ }
}
}
if (search_start == 0)
@@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot)
{
- if (level == 0) {
+ if (level == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
sizeof(struct btrfs_item),
key, btrfs_header_nritems(eb),
slot);
- } else {
+ else
return generic_bin_search(eb,
offsetof(struct btrfs_node, ptrs),
sizeof(struct btrfs_key_ptr),
key, btrfs_header_nritems(eb),
slot);
- }
- return -1;
}
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
+ tree_mod_log_set_root_pointer(root, child);
rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
@@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
@@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- del_ptr(trans, root, path, level + 1, pslot + 1);
+ del_ptr(trans, root, path, level + 1, pslot + 1, 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &right_key, pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- del_ptr(trans, root, path, level + 1, pslot);
+ del_ptr(trans, root, path, level + 1, pslot, 1);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+ pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &disk_key, pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &disk_key, pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1496,7 +2158,7 @@ static int
read_block_for_search(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p,
struct extent_buffer **eb_ret, int level, int slot,
- struct btrfs_key *key)
+ struct btrfs_key *key, u64 time_seq)
{
u64 blocknr;
u64 gen;
@@ -1850,7 +2512,7 @@ cow_done:
}
err = read_block_for_search(trans, root, p,
- &b, level, slot, key);
+ &b, level, slot, key, 0);
if (err == -EAGAIN)
goto again;
if (err) {
@@ -1922,6 +2584,115 @@ done:
}
/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq)
+{
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ u8 lowest_level = 0;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(p->nodes[0] != NULL);
+
+ if (p->search_commit_root) {
+ BUG_ON(time_seq);
+ return btrfs_search_slot(NULL, root, key, p, 0, 0);
+ }
+
+again:
+ b = get_old_root(root, time_seq);
+ extent_buffer_get(b);
+ level = btrfs_header_level(b);
+ btrfs_tree_read_lock(b);
+ p->locks[level] = BTRFS_READ_LOCK;
+
+ while (b) {
+ level = btrfs_header_level(b);
+ p->nodes[level] = b;
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+
+ ret = bin_search(b, key, level, &slot);
+
+ if (level != 0) {
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
+ slot -= 1;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(NULL, root, p, &b, level,
+ slot, key, time_seq);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ err = btrfs_try_tree_read_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
+ b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+ if (b != p->nodes[level]) {
+ btrfs_tree_unlock_rw(p->nodes[level],
+ p->locks[level]);
+ p->locks[level] = 0;
+ p->nodes[level] = b;
+ }
+ } else {
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
+ if (ret < 0)
+ btrfs_release_path(p);
+
+ return ret;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
if (!path->nodes[i])
break;
t = path->nodes[i];
+ tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+ push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
+ tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
+ src_nritems - push_items);
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
(src_nritems - push_items) *
@@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
+ tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+ src_nritems - push_items, push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid, &lower_key,
- level, root->node->start, 0, 0);
+ level, root->node->start, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
+ tree_mod_log_set_root_pointer(root, c);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
static void insert_ptr(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_disk_key *key, u64 bytenr,
- int slot, int level)
+ int slot, int level, int tree_mod_log)
{
struct extent_buffer *lower;
int nritems;
+ int ret;
BUG_ON(!path->nodes[level]);
btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(slot > nritems);
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
if (slot != nritems) {
+ if (tree_mod_log && level)
+ tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+ slot, nritems - slot);
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
+ if (tree_mod_log && level) {
+ ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+ MOD_LOG_KEY_ADD);
+ BUG_ON(ret < 0);
+ }
btrfs_set_node_key(lower, key, slot);
btrfs_set_node_blockptr(lower, slot, bytenr);
WARN_ON(trans->transid == 0);
@@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid,
- &disk_key, level, c->start, 0, 0);
+ &disk_key, level, c->start, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_chunk_tree_uuid(split),
BTRFS_UUID_SIZE);
-
+ tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
@@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(split);
insert_ptr(trans, root, path, &disk_key, split->start,
- path->slots[level + 1] + 1, level + 1);
+ path->slots[level + 1] + 1, level + 1, 1);
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
@@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(l, mid);
btrfs_item_key(right, &disk_key, 0);
insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
+ path->slots[1] + 1, 1, 0);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -3004,7 +3793,7 @@ again:
right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
root->root_key.objectid,
- &disk_key, 0, l->start, 0, 0);
+ &disk_key, 0, l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -3028,7 +3817,7 @@ again:
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
+ path->slots[1] + 1, 1, 0);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3037,7 +3826,7 @@ again:
} else {
btrfs_set_header_nritems(right, 0);
insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1], 1);
+ path->slots[1], 1, 0);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* empty a node.
*/
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot)
+ struct btrfs_path *path, int level, int slot,
+ int tree_mod_log)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
+ int ret;
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
+ if (tree_mod_log && level)
+ tree_mod_log_eb_move(root->fs_info, parent, slot,
+ slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
+ } else if (tree_mod_log && level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
}
+
nritems--;
btrfs_set_header_nritems(parent, nritems);
if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf)
{
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- del_ptr(trans, root, path, 1, path->slots[1]);
+ del_ptr(trans, root, path, 1, path->slots[1], 1);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
extent_buffer_get(leaf);
- btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
@@ -4271,7 +5070,7 @@ again:
next = c;
next_rw_lock = path->locks[level];
ret = read_block_for_search(NULL, root, path, &next, level,
- slot, &key);
+ slot, &key, 0);
if (ret == -EAGAIN)
goto again;
@@ -4308,7 +5107,7 @@ again:
break;
ret = read_block_for_search(NULL, root, path, &next, level,
- 0, &key);
+ 0, &key, 0);
if (ret == -EAGAIN)
goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd72331d60..0236d03c673 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_FT_XATTR 8
#define BTRFS_FT_MAX 9
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
u8 csum;
} __attribute__ ((__packed__));
+struct btrfs_dev_stats_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
+ /* this protects tree_mod_seq_list */
+ spinlock_t tree_mod_seq_lock;
+ atomic_t tree_mod_seq;
+ struct list_head tree_mod_seq_list;
+
+ /* this protects tree_mod_log */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+
atomic_t nr_async_submits;
atomic_t async_submit_draining;
atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
struct list_head root_list;
spinlock_t orphan_lock;
- struct list_head orphan_list;
+ atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
@@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_BALANCE_ITEM_KEY 248
/*
+ * Persistantly stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY 249
+
+/*
* string items are for debugging. They just store a short string of
* data in the FS
*/
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
+/* btrfs_dev_stats_item */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index)
+{
+ u64 val;
+
+ read_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+ return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index, u64 val)
+{
+ write_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+}
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size, int for_cow);
+ u64 hint, u64 empty_size);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref, int for_cow);
+ u64 parent, int last_ref);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
int start_slot, int cache_only, u64 *last_ret,
@@ -2922,7 +2974,6 @@ int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
int btrfs_dirty_inode(struct inode *inode);
-int btrfs_update_time(struct file *file);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@@ -3098,4 +3149,23 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
+/* delayed seq elem */
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+ u32 flags;
+};
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+
+static inline int is_fstree(u64 rootid)
+{
+ if (rootid == BTRFS_FS_TREE_OBJECTID ||
+ (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 03e3748d84d..c18d0442ae6 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
spin_lock(&BTRFS_I(inode)->lock);
- if (BTRFS_I(inode)->delalloc_meta_reserved) {
- BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
spin_unlock(&BTRFS_I(inode)->lock);
release = true;
goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
btrfs_set_stack_inode_generation(inode_item,
BTRFS_I(inode)->generation);
- btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+ btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
- BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
+ inode->i_version = btrfs_stack_inode_sequence(inode_item);
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 69f22e3ab3b..13ae7b04790 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (need_ref_seq(for_cow, ref_root))
+ if (is_fstree(ref_root))
seq = inc_delayed_seq(delayed_refs);
ref->seq = seq;
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (need_ref_seq(for_cow, ref_root))
+ if (is_fstree(ref_root))
seq = inc_delayed_seq(delayed_refs);
ref->seq = seq;
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
for_cow);
- if (!need_ref_seq(for_cow, ref_root) &&
+ if (!is_fstree(ref_root) &&
waitqueue_active(&delayed_refs->seq_wait))
wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
action, for_cow);
- if (!need_ref_seq(for_cow, ref_root) &&
+ if (!is_fstree(ref_root) &&
waitqueue_active(&delayed_refs->seq_wait))
wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d8f244d9492..413927fb995 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
-struct seq_list {
- struct list_head list;
- u64 seq;
-};
-
static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
{
assert_spin_locked(&delayed_refs->lock);
@@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
u64 seq);
/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
- if (for_cow)
- return 0;
-
- if (rootid == BTRFS_FS_TREE_OBJECTID)
- return 1;
-
- if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
- return 1;
-
- return 0;
-}
-
-/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
*/
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1fe74a2ce1..7ae51decf6d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
- INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+ atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
BTRFS_TREE_LOG_OBJECTID, NULL,
- 0, 0, 0, 0);
+ 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->free_chunk_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv);
btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;
+ fs_info->tree_mod_log = RB_ROOT;
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->root = tree_root;
memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
sizeof(struct btrfs_key));
- BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+ set_bit(BTRFS_INODE_DUMMY,
+ &BTRFS_I(fs_info->btree_inode)->runtime_flags);
insert_inode_hash(fs_info->btree_inode);
spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2353,6 +2359,13 @@ retry_root_backup:
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
+ ret = btrfs_init_dev_stats(fs_info);
+ if (ret) {
+ printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+ ret);
+ goto fail_block_groups;
+ }
+
ret = btrfs_init_space_info(fs_info);
if (ret) {
printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2556,18 +2569,19 @@ recovery_tree_root:
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (uptodate) {
set_buffer_uptodate(bh);
} else {
+ struct btrfs_device *device = (struct btrfs_device *)
+ bh->b_private;
+
printk_ratelimited(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
+ "I/O error on %s\n", device->name);
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
clear_buffer_uptodate(bh);
+ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
}
unlock_buffer(bh);
put_bh(bh);
@@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device,
set_buffer_uptodate(bh);
lock_buffer(bh);
bh->b_end_io = btrfs_end_buffer_write_sync;
+ bh->b_private = device;
}
/*
@@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
}
if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
+ if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
}
/* drop the reference from the wait == 0 run */
@@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-/* Kill all outstanding I/O */
-void btrfs_abort_devices(struct btrfs_root *root)
-{
- struct list_head *head;
- struct btrfs_device *dev;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- head = &root->fs_info->fs_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- blk_abort_queue(dev->bdev->bd_disk->queue);
- }
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-}
-
void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
return 0;
}
-static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
- u64 start, u64 end,
- struct extent_state *state)
-{
- struct super_block *sb = page->mapping->host->i_sb;
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- btrfs_error(fs_info, -EIO,
- "Error occured while writing out btree at %llu", start);
- return -EIO;
-}
-
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = {
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
- .writepage_io_failed_hook = btree_writepage_io_failed_hook,
};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index ab1830aaf0e..05b3fab39f7 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
int btrfs_cleanup_transaction(struct btrfs_root *root);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_root *root);
-void btrfs_abort_devices(struct btrfs_root *root);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index e887ee62b6d..614f34a899c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -13,15 +13,14 @@
parent_root_objectid) / 4)
#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
-static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
- int connectable)
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
struct btrfs_fid *fid = (struct btrfs_fid *)fh;
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type;
- if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
*max_len = BTRFS_FID_SIZE_CONNECTABLE;
return 255;
} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
@@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
+ if (parent) {
u64 parent_root_id;
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
parent_root_id = BTRFS_I(parent)->root->objectid;
- spin_unlock(&dentry->d_lock);
-
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49fd7b66d57..4b5a1e1bdef 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3578,7 +3578,7 @@ again:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
out:
- mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
BTRFS_I(inode)->outstanding_extents--;
if (BTRFS_I(inode)->outstanding_extents == 0 &&
- BTRFS_I(inode)->delalloc_meta_reserved) {
+ test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
drop_inode_space = 1;
- BTRFS_I(inode)->delalloc_meta_reserved = 0;
- }
/*
* If we have more or the same amount of outsanding extents than we have
@@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
* Add an item to reserve for updating the inode when we complete the
* delalloc io.
*/
- if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+ if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
nr_extents++;
extra_reserve = 1;
}
@@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_lock(&BTRFS_I(inode)->lock);
if (extra_reserve) {
- BTRFS_I(inode)->delalloc_meta_reserved = 1;
+ set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
nr_extents--;
}
BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -5217,7 +5218,7 @@ out:
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref, int for_cow)
+ u64 parent, int last_ref)
{
struct btrfs_block_group_cache *cache = NULL;
int ret;
@@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
buf->start, buf->len,
parent, root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+ BTRFS_DROP_DELAYED_REF, NULL, 0);
BUG_ON(ret); /* -ENOMEM */
}
@@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size, int for_cow)
+ u64 hint, u64 empty_size)
{
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
ins.objectid,
ins.offset, parent, root_objectid,
level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op, for_cow);
+ extent_op, 0);
BUG_ON(ret); /* -ENOMEM */
}
return buf;
@@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_owner(path->nodes[level + 1]));
}
- btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9018a05036..2c8f7b20461 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
return parent;
}
- entry = rb_entry(node, struct tree_entry, rb_node);
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
@@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state)
/*
* utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1)
+ * it will optionally wake up any one waiting on this state (wake == 1).
*
* If no bits are set on the state struct after clearing things, the
* struct is freed and removed from the tree
@@ -570,10 +569,8 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- clear_state_bit(tree, state, &bits, wake);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
+ state = clear_state_bit(tree, state, &bits, wake);
+ goto next;
}
goto search_again;
}
@@ -781,7 +778,6 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- struct rb_node *next_node;
if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
@@ -789,20 +785,15 @@ hit_next:
}
set_state_bits(tree, state, &bits);
-
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
-
start = last_end + 1;
- next_node = rb_next(&state->rb_node);
- if (next_node && start < end && prealloc && !need_resched()) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
goto search_again;
}
@@ -845,6 +836,10 @@ hit_next:
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
}
goto search_again;
}
@@ -994,21 +989,14 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- struct rb_node *next_node;
-
set_state_bits(tree, state, &bits);
- clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
-
start = last_end + 1;
- next_node = rb_next(&state->rb_node);
- if (next_node && start < end && prealloc && !need_resched()) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
goto search_again;
}
@@ -1042,10 +1030,13 @@ hit_next:
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
- clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
}
goto search_again;
}
@@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
cached_state, mask);
}
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state,
- gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
cached_state, mask);
@@ -1293,7 +1283,7 @@ out:
* returned if we find something, and *start_ret and *end_ret are
* set to reflect the state struct that was found.
*
- * If nothing was found, 1 is returned, < 0 on error
+ * If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
/* try to remap that extent elsewhere? */
bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
@@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
uptodate = 0;
}
- if (!uptodate && tree->ops &&
- tree->ops->writepage_io_failed_hook) {
- ret = tree->ops->writepage_io_failed_hook(NULL, page,
- start, end, NULL);
- /* Writeback already completed */
- if (ret == 0)
- return 1;
- }
-
if (!uptodate) {
- clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
ClearPageUptodate(page);
SetPageError(page);
}
@@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
state, mirror);
- if (ret)
+ if (ret) {
+ /* no IO indicated but software detected errors
+ * in the block, either checksum errors or
+ * issues with the contents */
+ struct btrfs_root *root =
+ BTRFS_I(page->mapping->host)->root;
+ struct btrfs_device *device;
+
uptodate = 0;
- else
+ device = btrfs_find_device_for_logical(
+ root, start, mirror);
+ if (device)
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ } else {
clean_io_failure(start, page);
+ }
}
if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb,
u64 offset = eb->start;
unsigned long i, num_pages;
int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
- int ret;
+ int ret = 0;
clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
@@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
eb->start = start;
eb->len = len;
eb->tree = tree;
+ eb->bflags = 0;
rwlock_init(&eb->lock);
atomic_set(&eb->write_locks, 0);
atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
return eb;
}
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+ unsigned long i;
+ struct page *p;
+ struct extent_buffer *new;
+ unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+ new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
+ if (new == NULL)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ p = alloc_page(GFP_ATOMIC);
+ BUG_ON(!p);
+ attach_extent_buffer_page(new, p);
+ WARN_ON(PageDirty(p));
+ SetPageUptodate(p);
+ new->pages[i] = p;
+ }
+
+ copy_extent_buffer(new, src, 0, 0, src->len);
+ set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+ set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+ return new;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+{
+ struct extent_buffer *eb;
+ unsigned long num_pages = num_extent_pages(0, len);
+ unsigned long i;
+
+ eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
+ if (!eb)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ eb->pages[i] = alloc_page(GFP_ATOMIC);
+ if (!eb->pages[i])
+ goto err;
+ }
+ set_extent_buffer_uptodate(eb);
+ btrfs_set_header_nritems(eb, 0);
+ set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+ return eb;
+err:
+ for (i--; i > 0; i--)
+ __free_page(eb->pages[i]);
+ __free_extent_buffer(eb);
+ return NULL;
+}
+
static int extent_buffer_under_io(struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
@@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
unsigned long start_idx)
{
unsigned long index;
+ unsigned long num_pages;
struct page *page;
+ int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
BUG_ON(extent_buffer_under_io(eb));
- index = num_extent_pages(eb->start, eb->len);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ index = start_idx + num_pages;
if (start_idx >= index)
return;
do {
index--;
page = extent_buffer_page(eb, index);
- if (page) {
+ if (page && mapped) {
spin_lock(&page->mapping->private_lock);
/*
* We do this since we'll remove the pages after we've
@@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
}
spin_unlock(&page->mapping->private_lock);
+ }
+ if (page) {
/* One for when we alloced the page */
page_cache_release(page);
}
@@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
{
WARN_ON(atomic_read(&eb->refs) == 0);
if (atomic_dec_and_test(&eb->refs)) {
- struct extent_io_tree *tree = eb->tree;
+ if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ } else {
+ struct extent_io_tree *tree = eb->tree;
- spin_unlock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
- spin_lock(&tree->buffer_lock);
- radix_tree_delete(&tree->buffer,
- eb->start >> PAGE_CACHE_SHIFT);
- spin_unlock(&tree->buffer_lock);
+ spin_lock(&tree->buffer_lock);
+ radix_tree_delete(&tree->buffer,
+ eb->start >> PAGE_CACHE_SHIFT);
+ spin_unlock(&tree->buffer_lock);
+ }
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_page(eb, 0);
@@ -4260,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ if (atomic_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b516c3b8dec..25900af5b15 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,6 +39,7 @@
#define EXTENT_BUFFER_STALE 6
#define EXTENT_BUFFER_WRITEBACK 7
#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
- int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end,
- struct extent_state *state);
int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 53bf2d764bb..70dc8ca73e2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -65,6 +65,21 @@ struct inode_defrag {
int cycled;
};
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
+ int ret;
p = &root->fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- if (defrag->ino < entry->ino)
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
p = &parent->rb_left;
- else if (defrag->ino > entry->ino)
+ else if (ret > 0)
p = &parent->rb_right;
else {
/* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
goto exists;
}
}
- BTRFS_I(inode)->in_defrag = 1;
+ set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (btrfs_fs_closing(root->fs_info))
return 0;
- if (BTRFS_I(inode)->in_defrag)
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
return 0;
if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->root = root->root_key.objectid;
spin_lock(&root->fs_info->defrag_inodes_lock);
- if (!BTRFS_I(inode)->in_defrag)
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
__btrfs_add_inode_defrag(inode, defrag);
else
kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
/*
* must be called with the defrag_inodes lock held
*/
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
+ u64 root, u64 ino,
struct rb_node **next)
{
struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
struct rb_node *p;
struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
p = info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- if (ino < entry->ino)
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
p = parent->rb_left;
- else if (ino > entry->ino)
+ else if (ret > 0)
p = parent->rb_right;
else
return entry;
}
if (next) {
- while (parent && ino > entry->ino) {
+ while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
entry = rb_entry(parent, struct inode_defrag, rb_node);
}
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
u64 first_ino = 0;
+ u64 root_objectid = 0;
int num_defrag;
int defrag_batch = 1024;
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
n = NULL;
/* find an inode to defrag */
- defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+ defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
+ first_ino, &n);
if (!defrag) {
- if (n)
- defrag = rb_entry(n, struct inode_defrag, rb_node);
- else if (first_ino) {
+ if (n) {
+ defrag = rb_entry(n, struct inode_defrag,
+ rb_node);
+ } else if (root_objectid || first_ino) {
+ root_objectid = 0;
first_ino = 0;
continue;
} else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
/* remove it from the rbtree */
first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
goto next;
/* do a chunk of defrag */
- BTRFS_I(inode)->in_defrag = 0;
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
range.start = defrag->last_offset;
num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
defrag_batch);
@@ -1404,12 +1433,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
- err = btrfs_update_time(file);
+ err = file_update_time(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
}
- BTRFS_I(inode)->sequence++;
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
@@ -1466,8 +1494,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
*/
- if (BTRFS_I(inode)->ordered_data_close) {
- BTRFS_I(inode)->ordered_data_close = 0;
+ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags)) {
btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
@@ -1498,14 +1526,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_btrfs_sync_file(file, datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
mutex_lock(&inode->i_mutex);
- /* we wait first, since the writeback may change the inode */
+ /*
+ * we wait first, since the writeback may change the inode, also wait
+ * ordered range does a filemape_write_and_wait_range which is why we
+ * don't do it above like other file systems.
+ */
root->log_batch++;
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ btrfs_wait_ordered_range(inode, start, end);
root->log_batch++;
/*
@@ -1523,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* syncing
*/
smp_mb();
- if (BTRFS_I(inode)->last_trans <=
+ if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+ BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 202008ec367..81296c57405 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -33,6 +33,8 @@
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
return ERR_PTR(-ENOENT);
}
- inode->i_mapping->flags &= ~__GFP_FS;
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
return inode;
}
@@ -365,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
{
- u64 *val;
+ __le64 *val;
io_ctl_map_page(io_ctl, 1);
@@ -388,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
{
- u64 *gen;
+ __le64 *gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -584,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
return 0;
}
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries. This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache. So run through the space cache that we just
+ * loaded and merge contiguous entries. This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *e, *prev = NULL;
+ struct rb_node *n;
+
+again:
+ spin_lock(&ctl->tree_lock);
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+ e = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!prev)
+ goto next;
+ if (e->bitmap || prev->bitmap)
+ goto next;
+ if (prev->offset + prev->bytes == e->offset) {
+ unlink_free_space(ctl, prev);
+ unlink_free_space(ctl, e);
+ prev->bytes += e->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ link_free_space(ctl, prev);
+ prev = NULL;
+ spin_unlock(&ctl->tree_lock);
+ goto again;
+ }
+next:
+ prev = e;
+ }
+ spin_unlock(&ctl->tree_lock);
+}
+
int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
@@ -726,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
}
io_ctl_drop_pages(&io_ctl);
+ merge_space_tree(ctl);
ret = 1;
out:
io_ctl_free(&io_ctl);
@@ -972,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
goto out;
- ret = filemap_write_and_wait(inode->i_mapping);
- if (ret)
- goto out;
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ceb7b9c9edc..f6ab6f5e635 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
ret = insert_inline_extent(trans, root, inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
- if (ret) {
+ if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
return ret;
+ } else if (ret == -ENOSPC) {
+ return 1;
}
+
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
@@ -1572,11 +1575,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
if (btrfs_is_free_space_inode(root, inode))
metadata = 2;
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
- if (ret)
- return ret;
-
if (!(rw & REQ_WRITE)) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+ if (ret)
+ return ret;
+
if (bio_flags & EXTENT_BIO_COMPRESSED) {
return btrfs_submit_compressed_read(inode, bio,
mirror_num, bio_flags);
@@ -1815,25 +1818,24 @@ out:
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
+ struct inode *inode = ordered_extent->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
int compress_type = 0;
int ret;
bool nolock;
- ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1);
- if (!ret)
- return 0;
- BUG_ON(!ordered_extent); /* Logic error */
-
nolock = btrfs_is_free_space_inode(root, inode);
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ ret = -EIO;
+ goto out;
+ }
+
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1889,12 +1891,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->file_offset,
ordered_extent->len);
}
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
if (ret < 0) {
btrfs_abort_transaction(trans, root, ret);
- goto out;
+ goto out_unlock;
}
add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1905,10 +1905,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, root, ret);
- goto out;
+ goto out_unlock;
}
}
ret = 0;
+out_unlock:
+ unlock_extent_cached(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
if (root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1919,26 +1923,57 @@ out:
btrfs_end_transaction(trans, root);
}
+ if (ret)
+ clear_extent_uptodate(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, NULL, GFP_NOFS);
+
+ /*
+ * This needs to be dont to make sure anybody waiting knows we are done
+ * upating everything for this ordered extent.
+ */
+ btrfs_remove_ordered_extent(inode, ordered_extent);
+
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
- return 0;
-out_unlock:
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
- goto out;
+ return ret;
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
}
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
+ struct btrfs_workers *workers;
+
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
ClearPagePrivate2(page);
- return btrfs_finish_ordered_io(page->mapping->host, start, end);
+ if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+ end - start + 1, uptodate))
+ return 0;
+
+ ordered_extent->work.func = finish_ordered_fn;
+ ordered_extent->work.flags = 0;
+
+ if (btrfs_is_free_space_inode(root, inode))
+ workers = &root->fs_info->endio_freespace_worker;
+ else
+ workers = &root->fs_info->endio_write_workers;
+ btrfs_queue_worker(workers, &ordered_extent->work);
+
+ return 0;
}
/*
@@ -2072,12 +2107,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
int ret;
- if (!list_empty(&root->orphan_list) ||
+ if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
spin_lock(&root->orphan_lock);
- if (!list_empty(&root->orphan_list)) {
+ if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2134,8 +2169,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
block_rsv = NULL;
}
- if (list_empty(&BTRFS_I(inode)->i_orphan)) {
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
#if 0
/*
* For proper ENOSPC handling, we should do orphan
@@ -2148,12 +2183,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
insert = 1;
#endif
insert = 1;
+ atomic_dec(&root->orphan_inodes);
}
- if (!BTRFS_I(inode)->orphan_meta_reserved) {
- BTRFS_I(inode)->orphan_meta_reserved = 1;
+ if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
reserve = 1;
- }
spin_unlock(&root->orphan_lock);
/* grab metadata reservation from transaction handle */
@@ -2166,6 +2201,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret && ret != -EEXIST) {
+ clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -2196,15 +2233,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
int ret = 0;
spin_lock(&root->orphan_lock);
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- list_del_init(&BTRFS_I(inode)->i_orphan);
+ if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags))
delete_item = 1;
- }
- if (BTRFS_I(inode)->orphan_meta_reserved) {
- BTRFS_I(inode)->orphan_meta_reserved = 0;
+ if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
release_rsv = 1;
- }
spin_unlock(&root->orphan_lock);
if (trans && delete_item) {
@@ -2212,8 +2247,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
}
- if (release_rsv)
+ if (release_rsv) {
btrfs_orphan_release_metadata(inode);
+ atomic_dec(&root->orphan_inodes);
+ }
return 0;
}
@@ -2341,6 +2378,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = PTR_ERR(trans);
goto out;
}
+ printk(KERN_ERR "auto deleting %Lu\n",
+ found_key.objectid);
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2352,9 +2391,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* add this inode to the orphan list so btrfs_orphan_del does
* the proper thing when we hit it
*/
- spin_lock(&root->orphan_lock);
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->orphan_lock);
+ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
@@ -2510,7 +2548,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+ inode->i_version = btrfs_inode_sequence(leaf, inode_item);
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2594,7 +2632,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
- btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+ btrfs_set_inode_sequence(leaf, item, inode->i_version);
btrfs_set_inode_transid(leaf, item, trans->transid);
btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2752,6 +2790,8 @@ err:
goto out;
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(inode);
+ inode_inc_iversion(dir);
inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
btrfs_update_inode(trans, root, dir);
out:
@@ -3089,6 +3129,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
}
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, dir);
if (ret)
@@ -3607,7 +3648,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
* any new writes get down to disk quickly.
*/
if (newsize == 0)
- BTRFS_I(inode)->ordered_data_close = 1;
+ set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags);
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
@@ -3638,6 +3680,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid) {
setattr_copy(inode, attr);
+ inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
@@ -3671,7 +3714,8 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (root->fs_info->log_root_recovering) {
- BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+ BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags));
goto no_delete;
}
@@ -4066,7 +4110,7 @@ static struct inode *new_simple_dir(struct super_block *s,
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
- BTRFS_I(inode)->dummy_inode = 1;
+ set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
inode->i_op = &btrfs_dir_ro_inode_operations;
@@ -4370,7 +4414,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
int ret = 0;
bool nolock = false;
- if (BTRFS_I(inode)->dummy_inode)
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
return 0;
if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@ -4403,7 +4447,7 @@ int btrfs_dirty_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
int ret;
- if (BTRFS_I(inode)->dummy_inode)
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
return 0;
trans = btrfs_join_transaction(root);
@@ -4431,46 +4475,18 @@ int btrfs_dirty_inode(struct inode *inode)
* This is a copy of file_update_time. We need this so we can return error on
* ENOSPC for updating the inode in the case of file write and mmap writes.
*/
-int btrfs_update_time(struct file *file)
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+ int flags)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- struct timespec now;
- int ret;
- enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
-
- /* First try to exhaust all avenues to not sync */
- if (IS_NOCMTIME(inode))
- return 0;
-
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_mtime, &now))
- sync_it = S_MTIME;
-
- if (!timespec_equal(&inode->i_ctime, &now))
- sync_it |= S_CTIME;
-
- if (IS_I_VERSION(inode))
- sync_it |= S_VERSION;
-
- if (!sync_it)
- return 0;
-
- /* Finally allowed to write? Takes lock. */
- if (mnt_want_write_file(file))
- return 0;
-
- /* Only change inode inside the lock region */
- if (sync_it & S_VERSION)
+ if (flags & S_VERSION)
inode_inc_iversion(inode);
- if (sync_it & S_CTIME)
- inode->i_ctime = now;
- if (sync_it & S_MTIME)
- inode->i_mtime = now;
- ret = btrfs_dirty_inode(inode);
- if (!ret)
- mark_inode_dirty_sync(inode);
- mnt_drop_write(file->f_path.mnt);
- return ret;
+ if (flags & S_CTIME)
+ inode->i_ctime = *now;
+ if (flags & S_MTIME)
+ inode->i_mtime = *now;
+ if (flags & S_ATIME)
+ inode->i_atime = *now;
+ return btrfs_dirty_inode(inode);
}
/*
@@ -4730,6 +4746,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->i_size +
name_len * 2);
+ inode_inc_iversion(parent_inode);
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
@@ -4937,6 +4954,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
}
btrfs_inc_nlink(inode);
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
@@ -5903,9 +5921,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
struct btrfs_ordered_extent *ordered = NULL;
- struct extent_state *cached_state = NULL;
u64 ordered_offset = dip->logical_offset;
u64 ordered_bytes = dip->bytes;
int ret;
@@ -5915,73 +5931,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
- ordered_bytes);
+ ordered_bytes, !err);
if (!ret)
goto out_test;
- BUG_ON(!ordered);
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- err = -ENOMEM;
- goto out;
- }
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = btrfs_ordered_update_i_size(inode, 0, ordered);
- if (!ret)
- err = btrfs_update_inode_fallback(trans, root, inode);
- goto out;
- }
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
- ordered->file_offset + ordered->len - 1, 0,
- &cached_state);
-
- if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
- ret = btrfs_mark_extent_written(trans, inode,
- ordered->file_offset,
- ordered->file_offset +
- ordered->len);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
- } else {
- ret = insert_reserved_file_extent(trans, inode,
- ordered->file_offset,
- ordered->start,
- ordered->disk_len,
- ordered->len,
- ordered->len,
- 0, 0, 0,
- BTRFS_FILE_EXTENT_REG);
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered->file_offset, ordered->len);
- if (ret) {
- err = ret;
- WARN_ON(1);
- goto out_unlock;
- }
- }
-
- add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
- ret = btrfs_ordered_update_i_size(inode, 0, ordered);
- if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
- btrfs_update_inode_fallback(trans, root, inode);
- ret = 0;
-out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
- ordered->file_offset + ordered->len - 1,
- &cached_state, GFP_NOFS);
-out:
- btrfs_delalloc_release_metadata(inode, ordered->len);
- btrfs_end_transaction(trans, root);
- ordered_offset = ordered->file_offset + ordered->len;
- btrfs_put_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
-
+ ordered->work.func = finish_ordered_fn;
+ ordered->work.flags = 0;
+ btrfs_queue_worker(&root->fs_info->endio_write_workers,
+ &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -5990,12 +5947,12 @@ out_test:
if (ordered_offset < dip->logical_offset + dip->bytes) {
ordered_bytes = dip->logical_offset + dip->bytes -
ordered_offset;
+ ordered = NULL;
goto again;
}
out_done:
bio->bi_private = dip->private;
- kfree(dip->csums);
kfree(dip);
/* If we had an error make sure to clear the uptodate flag */
@@ -6063,9 +6020,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int ret;
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- if (ret)
- goto err;
+
+ if (!write) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (ret)
+ goto err;
+ }
if (skip_sum)
goto map;
@@ -6485,13 +6445,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
+ struct inode *inode = page->mapping->host;
struct extent_io_tree *tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-
/*
* we have the page locked, so new writeback can't start,
* and the dirty bit won't be cleared while we are here.
@@ -6501,13 +6461,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(page->mapping->host)->io_tree;
+ tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
- ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+ ordered = btrfs_lookup_ordered_extent(inode,
page_offset(page));
if (ordered) {
/*
@@ -6522,9 +6482,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
* whoever cleared the private bit is responsible
* for the finish_ordered_io
*/
- if (TestClearPagePrivate2(page)) {
- btrfs_finish_ordered_io(page->mapping->host,
- page_start, page_end);
+ if (TestClearPagePrivate2(page) &&
+ btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
+ PAGE_CACHE_SIZE, 1)) {
+ btrfs_finish_ordered_io(ordered);
}
btrfs_put_ordered_extent(ordered);
cached_state = NULL;
@@ -6576,7 +6537,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (!ret) {
- ret = btrfs_update_time(vma->vm_file);
+ ret = file_update_time(vma->vm_file);
reserved = 1;
}
if (ret) {
@@ -6771,7 +6732,8 @@ static int btrfs_truncate(struct inode *inode)
* using truncate to replace the contents of the file will
* end up with a zero length file after a crash.
*/
- if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+ if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags))
btrfs_add_ordered_operation(trans, root, inode);
while (1) {
@@ -6894,7 +6856,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->root = NULL;
ei->space_info = NULL;
ei->generation = 0;
- ei->sequence = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
ei->logged_trans = 0;
@@ -6909,11 +6870,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->outstanding_extents = 0;
ei->reserved_extents = 0;
- ei->ordered_data_close = 0;
- ei->orphan_meta_reserved = 0;
- ei->dummy_inode = 0;
- ei->in_defrag = 0;
- ei->delalloc_meta_reserved = 0;
+ ei->runtime_flags = 0;
ei->force_compress = BTRFS_COMPRESS_NONE;
ei->delayed_node = NULL;
@@ -6927,7 +6884,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->log_mutex);
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
- INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
RB_CLEAR_NODE(&ei->rb_node);
@@ -6972,13 +6928,12 @@ void btrfs_destroy_inode(struct inode *inode)
spin_unlock(&root->fs_info->ordered_extent_lock);
}
- spin_lock(&root->orphan_lock);
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
(unsigned long long)btrfs_ino(inode));
- list_del_init(&BTRFS_I(inode)->i_orphan);
+ atomic_dec(&root->orphan_inodes);
}
- spin_unlock(&root->orphan_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -7193,6 +7148,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
btrfs_add_ordered_operation(trans, root, old_inode);
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
@@ -7219,6 +7177,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (new_inode) {
+ inode_inc_iversion(new_inode);
new_inode->i_ctime = CURRENT_TIME;
if (unlikely(btrfs_ino(new_inode) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7490,6 +7449,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_offset += ins.offset;
*alloc_hint = ins.objectid + ins.offset;
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -7647,6 +7607,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
+ .update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
@@ -7657,6 +7618,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
+ .update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
@@ -7670,6 +7632,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
+ .update_time = btrfs_update_time,
};
const struct dentry_operations btrfs_dentry_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14f8e1faa46..24b776c08d9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
btrfs_update_iflags(inode);
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
@@ -367,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
return PTR_ERR(trans);
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0, 0);
+ 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -2262,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
di_args->bytes_used = dev->bytes_used;
di_args->total_bytes = dev->total_bytes;
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
- if (dev->name)
+ if (dev->name) {
strncpy(di_args->path, dev->name, sizeof(di_args->path));
- else
+ di_args->path[sizeof(di_args->path) - 1] = 0;
+ } else {
di_args->path[0] = '\0';
+ }
out:
if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
@@ -2622,6 +2625,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
/*
@@ -2914,7 +2918,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
- user_dest = (struct btrfs_ioctl_space_info *)
+ user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -3042,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
return ret;
}
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+ void __user *arg, int reset_after_read)
+{
+ struct btrfs_ioctl_get_dev_stats *sa;
+ int ret;
+
+ if (reset_after_read && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ ret = btrfs_get_dev_stats(root, sa, reset_after_read);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
int ret = 0;
@@ -3212,8 +3238,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
}
}
-static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
@@ -3225,6 +3252,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
if (fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ return ret;
+
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -3291,6 +3322,7 @@ out_bargs:
out:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
+ mnt_drop_write(file->f_path.mnt);
return ret;
}
@@ -3386,7 +3418,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
- return btrfs_ioctl_balance(root, NULL);
+ return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
@@ -3419,11 +3451,15 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(root, argp);
case BTRFS_IOC_BALANCE_V2:
- return btrfs_ioctl_balance(root, argp);
+ return btrfs_ioctl_balance(file, argp);
case BTRFS_IOC_BALANCE_CTL:
return btrfs_ioctl_balance_ctl(root, arg);
case BTRFS_IOC_BALANCE_PROGRESS:
return btrfs_ioctl_balance_progress(root, argp);
+ case BTRFS_IOC_GET_DEV_STATS:
+ return btrfs_ioctl_get_dev_stats(root, argp, 0);
+ case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
+ return btrfs_ioctl_get_dev_stats(root, argp, 1);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 086e6bdae1c..497c530724c 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {
__u64 inodes;
};
+enum btrfs_dev_stat_values {
+ /* disk I/O failure stats */
+ BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
+ BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
+ BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
+
+ /* stats for indirect indications for I/O failures */
+ BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
+ * contents is illegal: this is an
+ * indication that the block was damaged
+ * during read or write, or written to
+ * wrong location or read from wrong
+ * location */
+ BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
+ * been written */
+
+ BTRFS_DEV_STAT_VALUES_MAX
+};
+
+struct btrfs_ioctl_get_dev_stats {
+ __u64 devid; /* in */
+ __u64 nr_items; /* in/out */
+
+ /* out values: */
+ __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
+
+ __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
+};
+
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
struct btrfs_ioctl_ino_path_args)
+#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
+ struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+ struct btrfs_ioctl_get_dev_stats)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbf6d0d9aeb..9e138cdc36c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->len = len;
entry->disk_len = disk_len;
entry->bytes_left = len;
- entry->inode = inode;
+ entry->inode = igrab(inode);
entry->compress_type = compress_type;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
trace_btrfs_ordered_extent_add(inode, entry);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
ordered_data_tree_panic(inode, -EEXIST, file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
}
/*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
*/
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size)
+ u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
+ unsigned long flags;
u64 dec_end;
u64 dec_start;
u64 to_dec;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
(unsigned long long)to_dec);
}
entry->bytes_left -= to_dec;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -332,7 +336,7 @@ out:
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -347,15 +351,21 @@ out:
*/
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size)
+ u64 file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
+ if (cached && *cached) {
+ entry = *cached;
+ goto have_entry;
+ }
+
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
}
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
if (!offset_in_entry(entry, file_offset)) {
ret = 1;
goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
(unsigned long long)io_size);
}
entry->bytes_left -= io_size;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -383,7 +397,7 @@ out:
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (atomic_dec_and_test(&entry->refs)) {
+ if (entry->inode)
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* remove an ordered extent from the tree. No references are dropped
- * and you must wake_up entry->wait. You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
*/
-static void __btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_unlock_irq(&tree->lock);
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-}
-
-/*
- * remove an ordered extent from the tree. No references are dropped
- * but any waiters are woken.
- */
-void btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
-{
- struct btrfs_ordered_inode_tree *tree;
-
- tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
- __btrfs_remove_ordered_extent(inode, entry);
- spin_unlock(&tree->lock);
wake_up(&entry->wait);
}
@@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
if (orig_end > INT_LIMIT(loff_t))
orig_end = INT_LIMIT(loff_t);
}
-again:
+
/* start IO across the range first to instantiate any delalloc
* extents
*/
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- /* The compression code will leave pages locked but return from
- * writepage without setting the page writeback. Starting again
- * with WB_SYNC_ALL will end up waiting for the IO to actually start.
- */
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
end = orig_end;
found = 0;
@@ -657,11 +651,6 @@ again:
break;
end--;
}
- if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
- EXTENT_DELALLOC, 0, NULL)) {
- schedule_timeout(1);
- goto again;
- }
}
/*
@@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (entry)
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
node = tree_search(tree, file_offset + len);
@@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
out:
if (entry)
atomic_inc(&entry->refs);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered)
{
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
u64 disk_i_size;
u64 new_i_size;
u64 i_size_test;
@@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
else
offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
disk_i_size = BTRFS_I(inode)->disk_i_size;
/* truncate file */
@@ -798,14 +786,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
}
/*
- * we can't update the disk_isize if there are delalloc bytes
- * between disk_i_size and this ordered extent
- */
- if (test_range_bit(io_tree, disk_i_size, offset - 1,
- EXTENT_DELALLOC, 0, NULL)) {
- goto out;
- }
- /*
* walk backward from this ordered extent to disk_i_size.
* if we find an ordered extent then we can't update disk i_size
* yet
@@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
}
node = prev;
}
- while (node) {
+ for (; node; node = rb_prev(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ /* We treat this entry as if it doesnt exist */
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
if (test->file_offset + test->len <= disk_i_size)
break;
if (test->file_offset >= i_size)
break;
if (test->file_offset >= disk_i_size)
goto out;
- node = rb_prev(node);
}
new_i_size = min_t(u64, offset, i_size);
@@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
else
node = rb_first(&tree->tree);
}
- i_size_test = 0;
- if (node) {
- /*
- * do we have an area where IO might have finished
- * between our ordered extent and the next one.
- */
+
+ /*
+ * We are looking for an area between our current extent and the next
+ * ordered extent to update the i_size to. There are 3 cases here
+ *
+ * 1) We don't actually have anything and we can update to i_size.
+ * 2) We have stuff but they already did their i_size update so again we
+ * can just update to i_size.
+ * 3) We have an outstanding ordered extent so the most we can update
+ * our disk_i_size to is the start of the next offset.
+ */
+ i_size_test = i_size;
+ for (; node; node = rb_next(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset > offset)
+
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
+ if (test->file_offset > offset) {
i_size_test = test->file_offset;
- } else {
- i_size_test = i_size;
+ break;
+ }
}
/*
* i_size_test is the end of a region after this ordered
- * extent where there are no ordered extents. As long as there
- * are no delalloc bytes in this area, it is safe to update
- * disk_i_size to the end of the region.
+ * extent where there are no ordered extents, we can safely set
+ * disk_i_size to this.
*/
- if (i_size_test > offset &&
- !test_range_bit(io_tree, offset, i_size_test - 1,
- EXTENT_DELALLOC, 0, NULL)) {
+ if (i_size_test > offset)
new_i_size = min_t(u64, i_size_test, i_size);
- }
BTRFS_I(inode)->disk_i_size = new_i_size;
ret = 0;
out:
/*
- * we need to remove the ordered extent with the tree lock held
- * so that other people calling this function don't find our fully
- * processed ordered entry and skip updating the i_size
+ * We need to do this because we can't remove ordered extents until
+ * after the i_disk_size has been updated and then the inode has been
+ * updated to reflect the change, so we need to tell anybody who finds
+ * this ordered extent that we've already done all the real work, we
+ * just haven't completed all the other work.
*/
if (ordered)
- __btrfs_remove_ordered_extent(inode, ordered);
- spin_unlock(&tree->lock);
- if (ordered)
- wake_up(&ordered->wait);
+ set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+ spin_unlock_irq(&tree->lock);
return ret;
}
@@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
if (!ordered)
return 1;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr) {
num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
}
}
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
btrfs_put_ordered_extent(ordered);
return ret;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c355ad4dc1a..e03c560d299 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
+ * has done its due diligence in updating
+ * the isize. */
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -113,6 +119,8 @@ struct btrfs_ordered_extent {
/* a per root list of all the pending ordered extents */
struct list_head root_extent_list;
+
+ struct btrfs_work work;
};
@@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry);
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size);
+ u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size);
+ u64 *file_offset, u64 io_size,
+ int uptodate);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f38e452486b..5e23684887e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_dev_extent_chunk_offset(l, dev_extent),
(unsigned long long)
btrfs_dev_extent_length(l, dev_extent));
+ case BTRFS_DEV_STATS_KEY:
+ printk(KERN_INFO "\t\tdevice stats\n");
+ break;
};
}
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ac5d0108588..48a4882d8ad 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
{
struct reada_machine_work *rmw;
struct btrfs_fs_info *fs_info;
+ int old_ioprio;
rmw = container_of(work, struct reada_machine_work, work);
fs_info = rmw->fs_info;
kfree(rmw);
+ old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+ task_nice_ioprio(current));
+ set_task_ioprio(current, BTRFS_IOPRIO_READA);
__reada_start_machine(fs_info);
+ set_task_ioprio(current, old_ioprio);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2f3d6f917fb..a38cfa4f251 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -50,7 +50,7 @@ struct scrub_dev;
struct scrub_page {
struct scrub_block *sblock;
struct page *page;
- struct block_device *bdev;
+ struct btrfs_device *dev;
u64 flags; /* extent flags */
u64 generation;
u64 logical;
@@ -86,6 +86,7 @@ struct scrub_block {
unsigned int header_error:1;
unsigned int checksum_error:1;
unsigned int no_io_error_seen:1;
+ unsigned int generation_error:1; /* also sets header_error */
};
};
@@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sdev->stat.read_errors++;
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
@@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sdev->stat.read_errors++;
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sdev->stat.read_errors++;
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
@@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_unlock(&sdev->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("i/o error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sdev->stat_lock);
sdev->stat.csum_errors++;
spin_unlock(&sdev->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("checksum error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
} else if (sblock_bad->header_error) {
spin_lock(&sdev->stat_lock);
sdev->stat.verify_errors++;
@@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (__ratelimit(&_rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
+ if (sblock_bad->generation_error)
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
}
if (sdev->readonly)
@@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
page = sblock->pagev + page_index;
page->logical = logical;
page->physical = bbio->stripes[mirror_index].physical;
- /* for missing devices, bdev is NULL */
- page->bdev = bbio->stripes[mirror_index].dev->bdev;
+ /* for missing devices, dev->bdev is NULL */
+ page->dev = bbio->stripes[mirror_index].dev;
page->mirror_num = mirror_index + 1;
page->page = alloc_page(GFP_NOFS);
if (!page->page) {
@@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_page *page = sblock->pagev + page_num;
DECLARE_COMPLETION_ONSTACK(complete);
- if (page->bdev == NULL) {
+ if (page->dev->bdev == NULL) {
page->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
@@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
- bio->bi_bdev = page->bdev;
+ bio->bi_bdev = page->dev->bdev;
bio->bi_sector = page->physical >> 9;
bio->bi_end_io = scrub_complete_bio_end_io;
bio->bi_private = &complete;
@@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
h = (struct btrfs_header *)mapped_buffer;
if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
- generation != le64_to_cpu(h->generation) ||
memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE))
+ BTRFS_UUID_SIZE)) {
sblock->header_error = 1;
+ } else if (generation != le64_to_cpu(h->generation)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
csum = h->csum;
} else {
if (!have_csum)
@@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
- bio->bi_bdev = page_bad->bdev;
+ bio->bi_bdev = page_bad->dev->bdev;
bio->bi_sector = page_bad->physical >> 9;
bio->bi_end_io = scrub_complete_bio_end_io;
bio->bi_private = &complete;
@@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
/* this will also unplug the queue */
wait_for_completion(&complete);
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
+ btrfs_dev_stat_inc_and_print(page_bad->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ bio_put(bio);
+ return -EIO;
+ }
bio_put(bio);
}
@@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
- int fail = 0;
+ int fail_gen = 0;
+ int fail_cor = 0;
u64 len;
int index;
@@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock)
memcpy(on_disk_csum, s->csum, sdev->csum_size);
if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
- ++fail;
+ ++fail_cor;
if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
- ++fail;
+ ++fail_gen;
if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
- ++fail;
+ ++fail_cor;
len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
- ++fail;
+ ++fail_cor;
- if (fail) {
+ if (fail_cor + fail_gen) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
@@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
spin_lock(&sdev->stat_lock);
++sdev->stat.super_errors;
spin_unlock(&sdev->stat_lock);
+ if (fail_cor)
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
}
- return fail;
+ return fail_cor + fail_gen;
}
static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
return -ENOMEM;
}
spage->sblock = sblock;
- spage->bdev = sdev->dev->bdev;
+ spage->dev = sdev->dev;
spage->flags = flags;
spage->generation = gen;
spage->logical = logical;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5f8fca4195..96eb9fef7bd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_start(args, fmt);
if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
- strncpy(lvl, fmt, 3);
+ memcpy(lvl, fmt, 3);
+ lvl[3] = '\0';
fmt += 3;
type = logtypes[fmt[1] - '0'];
} else
@@ -435,11 +436,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_thread_pool:
intarg = 0;
match_int(&args[0], &intarg);
- if (intarg) {
+ if (intarg)
info->thread_pool_size = intarg;
- printk(KERN_INFO "btrfs: thread pool %d\n",
- info->thread_pool_size);
- }
break;
case Opt_max_inline:
num = match_strdup(&args[0]);
@@ -769,7 +767,7 @@ static int btrfs_fill_super(struct super_block *sb,
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
-
+ sb->s_flags |= MS_I_VERSION;
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
printk("btrfs: open_ctree failed\n");
@@ -925,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode)
*/
static char *setup_root_args(char *args)
{
- unsigned copied = 0;
- unsigned len = strlen(args) + 2;
- char *pos;
- char *ret;
+ unsigned len = strlen(args) + 2 + 1;
+ char *src, *dst, *buf;
/*
- * We need the same args as before, but minus
- *
- * subvol=a
- *
- * and add
- *
- * subvolid=0
+ * We need the same args as before, but with this substitution:
+ * s!subvol=[^,]+!subvolid=0!
*
- * which is a difference of 2 characters, so we allocate strlen(args) +
- * 2 characters.
+ * Since the replacement string is up to 2 bytes longer than the
+ * original, allocate strlen(args) + 2 + 1 bytes.
*/
- ret = kzalloc(len * sizeof(char), GFP_NOFS);
- if (!ret)
- return NULL;
- pos = strstr(args, "subvol=");
+ src = strstr(args, "subvol=");
/* This shouldn't happen, but just in case.. */
- if (!pos) {
- kfree(ret);
+ if (!src)
+ return NULL;
+
+ buf = dst = kmalloc(len, GFP_NOFS);
+ if (!buf)
return NULL;
- }
/*
- * The subvol=<> arg is not at the front of the string, copy everybody
- * up to that into ret.
+ * If the subvol= arg is not at the start of the string,
+ * copy whatever precedes it into buf.
*/
- if (pos != args) {
- *pos = '\0';
- strcpy(ret, args);
- copied += strlen(args);
- pos++;
+ if (src != args) {
+ *src++ = '\0';
+ strcpy(buf, args);
+ dst += strlen(args);
}
- strncpy(ret + copied, "subvolid=0", len - copied);
-
- /* Length of subvolid=0 */
- copied += 10;
+ strcpy(dst, "subvolid=0");
+ dst += strlen("subvolid=0");
/*
- * If there is no , after the subvol= option then we know there's no
- * other options and we can just return.
+ * If there is a "," after the original subvol=... string,
+ * copy that suffix into our buffer. Otherwise, we're done.
*/
- pos = strchr(pos, ',');
- if (!pos)
- return ret;
+ src = strchr(src, ',');
+ if (src)
+ strcpy(dst, src);
- /* Copy the rest of the arguments into our buffer */
- strncpy(ret + copied, pos, len - copied);
- copied += strlen(pos);
-
- return ret;
+ return buf;
}
static struct dentry *mount_subvol(const char *subvol_name, int flags,
@@ -1118,6 +1101,40 @@ error_fs_info:
return ERR_PTR(error);
}
+static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
+{
+ spin_lock_irq(&workers->lock);
+ workers->max_workers = new_limit;
+ spin_unlock_irq(&workers->lock);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+ int new_pool_size, int old_pool_size)
+{
+ if (new_pool_size == old_pool_size)
+ return;
+
+ fs_info->thread_pool_size = new_pool_size;
+
+ printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+ old_pool_size, new_pool_size);
+
+ btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
+ btrfs_set_max_workers(&fs_info->workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+}
+
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1137,6 +1154,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
+ btrfs_resize_thread_pool(fs_info,
+ fs_info->thread_pool_size, old_thread_pool_size);
+
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
@@ -1180,7 +1200,8 @@ restore:
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
fs_info->alloc_start = old_alloc_start;
- fs_info->thread_pool_size = old_thread_pool_size;
+ btrfs_resize_thread_pool(fs_info,
+ old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 36422254ef6..1791c6e3d83 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,6 +28,7 @@
#include "locking.h"
#include "tree-log.h"
#include "inode-map.h"
+#include "volumes.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -55,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)
static noinline int join_transaction(struct btrfs_root *root, int nofail)
{
struct btrfs_transaction *cur_trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- spin_unlock(&root->fs_info->trans_lock);
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
- if (root->fs_info->trans_no_join) {
+ if (fs_info->trans_no_join) {
if (!nofail) {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
}
- cur_trans = root->fs_info->running_transaction;
+ cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (cur_trans->aborted) {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return 0;
}
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
- spin_lock(&root->fs_info->trans_lock);
- if (root->fs_info->running_transaction) {
+ spin_lock(&fs_info->trans_lock);
+ if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the trans_no_join checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
- cur_trans = root->fs_info->running_transaction;
+ cur_trans = fs_info->running_transaction;
goto loop;
}
@@ -121,20 +123,38 @@ loop:
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
cur_trans->delayed_refs.seq = 1;
+
+ /*
+ * although the tree mod log is per file system and not per transaction,
+ * the log must never go across transaction boundaries.
+ */
+ smp_mb();
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+ "creating a fresh transaction\n");
+ WARN_ON(1);
+ }
+ if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
+ printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+ "creating a fresh transaction\n");
+ WARN_ON(1);
+ }
+ atomic_set(&fs_info->tree_mod_seq, 0);
+
init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+ list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
- root->fs_info->btree_inode->i_mapping);
- root->fs_info->generation++;
- cur_trans->transid = root->fs_info->generation;
- root->fs_info->running_transaction = cur_trans;
+ fs_info->btree_inode->i_mapping);
+ fs_info->generation++;
+ cur_trans->transid = fs_info->generation;
+ fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return 0;
}
@@ -758,6 +778,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_run_dev_stats(trans, root->fs_info);
+ BUG_ON(ret);
+
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index eb1ae908582..2017d0ff511 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
- btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
level = btrfs_header_level(eb);
@@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
@@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
WARN_ON(*level <= 0);
if (path->nodes[*level-1])
@@ -2657,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
btrfs_release_path(path);
+ if (ret > 0)
+ ret = 0;
return ret;
}
@@ -3028,21 +3040,6 @@ out:
return ret;
}
-static int inode_in_log(struct btrfs_trans_handle *trans,
- struct inode *inode)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
-
- mutex_lock(&root->log_mutex);
- if (BTRFS_I(inode)->logged_trans == trans->transid &&
- BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
- ret = 1;
- mutex_unlock(&root->log_mutex);
- return ret;
-}
-
-
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -3083,7 +3080,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (inode_in_log(trans, inode)) {
+ if (btrfs_inode_in_log(inode, trans->transid)) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 12f5147bd2b..ab942f46b3d 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
*
* ulist = ulist_alloc();
* ulist_add(ulist, root);
- * elem = NULL;
+ * ULIST_ITER_INIT(&uiter);
*
- * while ((elem = ulist_next(ulist, elem)) {
+ * while ((elem = ulist_next(ulist, &uiter)) {
* for (all child nodes n in elem)
* ulist_add(ulist, n);
* do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
*
* The allocated ulist will be returned in an initialized state.
*/
-struct ulist *ulist_alloc(unsigned long gfp_mask)
+struct ulist *ulist_alloc(gfp_t gfp_mask)
{
struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
* unaltered.
*/
int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
- unsigned long gfp_mask)
+ gfp_t gfp_mask)
+{
+ return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long *old_aux, gfp_t gfp_mask)
{
int i;
for (i = 0; i < ulist->nnodes; ++i) {
- if (ulist->nodes[i].val == val)
+ if (ulist->nodes[i].val == val) {
+ if (old_aux)
+ *old_aux = ulist->nodes[i].aux;
return 0;
+ }
}
if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
/**
* ulist_next - iterate ulist
* @ulist: ulist to iterate
- * @prev: previously returned element or %NULL to start iteration
+ * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
*
* Note: locking must be provided by the caller. In case of rwlocks only read
* locking is needed
*
- * This function is used to iterate an ulist. The iteration is started with
- * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
* end is reached. No guarantee is made with respect to the order in which
* the elements are returned. They might neither be returned in order of
* addition nor in ascending order.
* It is allowed to call ulist_add during an enumeration. Newly added items
* are guaranteed to show up in the running enumeration.
*/
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
{
- int next;
-
if (ulist->nnodes == 0)
return NULL;
-
- if (!prev)
- return &ulist->nodes[0];
-
- next = (prev - ulist->nodes) + 1;
- if (next < 0 || next >= ulist->nnodes)
+ if (uiter->i < 0 || uiter->i >= ulist->nnodes)
return NULL;
- return &ulist->nodes[next];
+ return &ulist->nodes[uiter->i++];
}
EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 2e25dec58ec..21bdc8ec813 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
*/
#define ULIST_SIZE 16
+struct ulist_iterator {
+ int i;
+};
+
/*
* element of the list
*/
@@ -59,10 +63,15 @@ struct ulist {
void ulist_init(struct ulist *ulist);
void ulist_fini(struct ulist *ulist);
void ulist_reinit(struct ulist *ulist);
-struct ulist *ulist_alloc(unsigned long gfp_mask);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
- unsigned long gfp_mask);
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+ gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long *old_aux, gfp_t gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist,
+ struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1411b99555a..7782020996f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
@@ -39,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -361,6 +364,7 @@ static noinline int device_list_add(const char *path,
return -ENOMEM;
}
device->devid = devid;
+ device->dev_stats_valid = 0;
device->work.func = pending_bios_fn;
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
@@ -1633,7 +1637,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
int ret = 0;
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
- return -EINVAL;
+ return -EROFS;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
@@ -4001,13 +4005,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
+static void *merge_stripe_index_into_bio_private(void *bi_private,
+ unsigned int stripe_index)
+{
+ /*
+ * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
+ * at most 1.
+ * The alternative solution (instead of stealing bits from the
+ * pointer) would be to allocate an intermediate structure
+ * that contains the old private pointer plus the stripe_index.
+ */
+ BUG_ON((((uintptr_t)bi_private) & 3) != 0);
+ BUG_ON(stripe_index > 3);
+ return (void *)(((uintptr_t)bi_private) | stripe_index);
+}
+
+static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
+{
+ return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
+}
+
+static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
+{
+ return (unsigned int)((uintptr_t)bi_private) & 3;
+}
+
static void btrfs_end_bio(struct bio *bio, int err)
{
- struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
int is_orig_bio = 0;
- if (err)
+ if (err) {
atomic_inc(&bbio->error);
+ if (err == -EIO || err == -EREMOTEIO) {
+ unsigned int stripe_index =
+ extract_stripe_index_from_bio_private(
+ bio->bi_private);
+ struct btrfs_device *dev;
+
+ BUG_ON(stripe_index >= bbio->num_stripes);
+ dev = bbio->stripes[stripe_index].dev;
+ if (bio->bi_rw & WRITE)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ else
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_READ_ERRS);
+ if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_print_on_error(dev);
+ }
+ }
if (bio == bbio->orig_bio)
is_orig_bio = 1;
@@ -4149,6 +4198,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bio = first_bio;
}
bio->bi_private = bbio;
+ bio->bi_private = merge_stripe_index_into_bio_private(
+ bio->bi_private, (unsigned int)dev_nr);
bio->bi_end_io = btrfs_end_bio;
bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
dev = bbio->stripes[dev_nr].dev;
@@ -4509,6 +4560,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
return ret;
}
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+ u64 logical, int mirror_num)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ int ret;
+ u64 map_length = 0;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_device *device;
+
+ BUG_ON(mirror_num == 0);
+ ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
+ mirror_num);
+ if (ret) {
+ BUG_ON(bbio != NULL);
+ return NULL;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
+ device = bbio->stripes[mirror_num - 1].dev;
+ kfree(bbio);
+ return device;
+}
+
int btrfs_read_chunk_tree(struct btrfs_root *root)
{
struct btrfs_path *path;
@@ -4583,3 +4656,230 @@ error:
btrfs_free_path(path);
return ret;
}
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_device *device;
+ struct btrfs_path *path = NULL;
+ int i;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ int item_size;
+ struct btrfs_dev_stats_item *ptr;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
+ device->name, (unsigned long long)device->devid);
+ __btrfs_reset_dev_stats(device);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ continue;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot,
+ struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_reset(device, i);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dev_root,
+ struct btrfs_device *device)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_stats_item *ptr;
+ int ret;
+ int i;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+ ret, device->name);
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /* need to delete old one and insert a new one */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+ device->name, ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+ device->name, ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_set_dev_stats_value(eb, ptr, i,
+ btrfs_dev_stat_read(device, i));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->dev_stats_valid || !device->dev_stats_dirty)
+ continue;
+
+ ret = update_dev_stat_item(trans, dev_root, device);
+ if (!ret)
+ device->dev_stats_dirty = 0;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+ btrfs_dev_stat_inc(dev, index);
+ btrfs_dev_stat_print_on_error(dev);
+}
+
+void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+ if (!dev->dev_stats_valid)
+ return;
+ printk_ratelimited(KERN_ERR
+ "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ dev->name,
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+ printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ dev->name,
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats,
+ int reset_after_read)
+{
+ struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int i;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (!dev) {
+ printk(KERN_WARNING
+ "btrfs: get dev_stats failed, device not found\n");
+ return -ENODEV;
+ } else if (!dev->dev_stats_valid) {
+ printk(KERN_WARNING
+ "btrfs: get dev_stats failed, not yet valid\n");
+ return -ENODEV;
+ } else if (reset_after_read) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (stats->nr_items > i)
+ stats->values[i] =
+ btrfs_dev_stat_read_and_reset(dev, i);
+ else
+ btrfs_dev_stat_reset(dev, i);
+ }
+ } else {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (stats->nr_items > i)
+ stats->values[i] = btrfs_dev_stat_read(dev, i);
+ }
+ if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+ stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+ return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bb6b03f97aa..3406a88ca83 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/sort.h>
#include "async-thread.h"
+#include "ioctl.h"
#define BTRFS_STRIPE_LEN (64 * 1024)
@@ -106,6 +107,11 @@ struct btrfs_device {
struct completion flush_wait;
int nobarriers;
+ /* disk I/O failure stats. For detailed description refer to
+ * enum btrfs_dev_stat_values in ioctl.h */
+ int dev_stats_valid;
+ int dev_stats_dirty; /* counters need to be written to disk */
+ atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
};
struct btrfs_fs_devices {
@@ -281,4 +287,50 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+ u64 logical, int mirror_num);
+void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats,
+ int reset_after_read);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+ int index)
+{
+ atomic_inc(dev->dev_stat_values + index);
+ dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+ int index)
+{
+ return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+ int index)
+{
+ int ret;
+
+ ret = atomic_xchg(dev->dev_stat_values + index, 0);
+ dev->dev_stats_dirty = 1;
+ return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+ int index, unsigned long val)
+{
+ atomic_set(dev->dev_stat_values + index, val);
+ dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+ int index)
+{
+ btrfs_dev_stat_set(dev, index, 0);
+}
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index e7a5659087e..3f4e2d69e83 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
if (ret)
goto out;
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
diff --git a/fs/buffer.c b/fs/buffer.c
index ad5938ca357..838a9cf246b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3152,7 +3152,7 @@ SYSCALL_DEFINE2(bdflush, int, func, long, data)
/*
* Buffer-head allocation
*/
-static struct kmem_cache *bh_cachep;
+static struct kmem_cache *bh_cachep __read_mostly;
/*
* Once the number of bh's in the machine exceeds this level, we start
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fbb2a643ef1..8e1b60e557b 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -40,38 +40,49 @@ struct ceph_nfs_confh {
u32 parent_name_hash;
} __attribute__ ((packed));
-static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
- int connectable)
+/*
+ * The presence of @parent_inode here tells us whether NFS wants a
+ * connectable file handle. However, we want to make a connectionable
+ * file handle unconditionally so that the MDS gets as much of a hint
+ * as possible. That means we only use @parent_dentry to indicate
+ * whether nfsd wants a connectable fh, and whether we should indicate
+ * failure from a too-small @max_len.
+ */
+static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
+ struct inode *parent_inode)
{
int type;
struct ceph_nfs_fh *fh = (void *)rawfh;
struct ceph_nfs_confh *cfh = (void *)rawfh;
- struct dentry *parent;
- struct inode *inode = dentry->d_inode;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
+ struct dentry *dentry = d_find_alias(inode);
+ struct dentry *parent;
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
- if (*max_len >= connected_handle_length) {
+ /* if we found an alias, generate a connectable fh */
+ if (*max_len >= connected_handle_length && dentry) {
dout("encode_fh %p connectable\n", dentry);
- cfh->ino = ceph_ino(dentry->d_inode);
+ spin_lock(&dentry->d_lock);
+ parent = dentry->d_parent;
+ cfh->ino = ceph_ino(inode);
cfh->parent_ino = ceph_ino(parent->d_inode);
cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
dentry);
*max_len = connected_handle_length;
type = 2;
+ spin_unlock(&dentry->d_lock);
} else if (*max_len >= handle_length) {
- if (connectable) {
+ if (parent_inode) {
+ /* nfsd wants connectable */
*max_len = connected_handle_length;
type = 255;
} else {
dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(dentry->d_inode);
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
type = 1;
}
@@ -79,7 +90,6 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
*max_len = handle_length;
type = 255;
}
- spin_unlock(&dentry->d_lock);
return type;
}
diff --git a/fs/compat.c b/fs/compat.c
index 3adf3d4c2cd..6161255fac4 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -871,12 +871,12 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
{
int error;
struct file *file;
+ int fput_needed;
struct compat_readdir_callback buf;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.result = 0;
buf.dirent = dirent;
@@ -885,8 +885,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
if (buf.result)
error = buf.result;
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
@@ -953,16 +952,15 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
struct file * file;
struct compat_linux_dirent __user * lastdirent;
struct compat_getdents_callback buf;
+ int fput_needed;
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
+ return -EFAULT;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.current_dir = dirent;
buf.previous = NULL;
@@ -979,8 +977,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
else
error = count - buf.count;
}
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
@@ -1041,16 +1038,15 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
struct file * file;
struct linux_dirent64 __user * lastdirent;
struct compat_getdents_callback64 buf;
+ int fput_needed;
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
+ return -EFAULT;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.current_dir = dirent;
buf.previous = NULL;
@@ -1068,8 +1064,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
else
error = count - buf.count;
}
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
diff --git a/fs/dcache.c b/fs/dcache.c
index 4435d8b3290..85c9e2bff8e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -683,8 +683,6 @@ EXPORT_SYMBOL(dget_parent);
/**
* d_find_alias - grab a hashed alias of inode
* @inode: inode in question
- * @want_discon: flag, used by d_splice_alias, to request
- * that only a DISCONNECTED alias be returned.
*
* If inode has a hashed alias, or is a directory and has any alias,
* acquire the reference to alias and return it. Otherwise return NULL.
@@ -693,10 +691,9 @@ EXPORT_SYMBOL(dget_parent);
* of a filesystem.
*
* If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one unless @want_discon is set,
- * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
+ * any other hashed alias over that.
*/
-static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode)
{
struct dentry *alias, *discon_alias;
@@ -708,7 +705,7 @@ again:
if (IS_ROOT(alias) &&
(alias->d_flags & DCACHE_DISCONNECTED)) {
discon_alias = alias;
- } else if (!want_discon) {
+ } else {
__dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
@@ -739,7 +736,7 @@ struct dentry *d_find_alias(struct inode *inode)
if (!list_empty(&inode->i_dentry)) {
spin_lock(&inode->i_lock);
- de = __d_find_alias(inode, 0);
+ de = __d_find_alias(inode);
spin_unlock(&inode->i_lock);
}
return de;
@@ -1650,9 +1647,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
if (inode && S_ISDIR(inode->i_mode)) {
spin_lock(&inode->i_lock);
- new = __d_find_alias(inode, 1);
+ new = __d_find_any_alias(inode);
if (new) {
- BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
spin_unlock(&inode->i_lock);
security_d_instantiate(new, inode);
d_move(new, dentry);
@@ -2482,7 +2478,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
struct dentry *alias;
/* Does an aliased dentry already exist? */
- alias = __d_find_alias(inode, 0);
+ alias = __d_find_alias(inode);
if (alias) {
actual = alias;
write_seqlock(&rename_lock);
@@ -2575,7 +2571,7 @@ static int prepend_path(const struct path *path,
bool slash = false;
int error = 0;
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
while (dentry != root->dentry || vfsmnt != root->mnt) {
struct dentry * parent;
@@ -2606,7 +2602,7 @@ static int prepend_path(const struct path *path,
error = prepend(buffer, buflen, "/", 1);
out:
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return error;
global_root:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ab35b113003..a07441a0a87 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -660,11 +660,10 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
char *lower_buf;
- size_t lower_bufsiz = PATH_MAX;
mm_segment_t old_fs;
int rc;
- lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
+ lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!lower_buf) {
rc = -ENOMEM;
goto out;
@@ -673,58 +672,29 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
set_fs(get_ds());
rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
(char __user *)lower_buf,
- lower_bufsiz);
+ PATH_MAX);
set_fs(old_fs);
if (rc < 0)
goto out;
- lower_bufsiz = rc;
rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
- lower_buf, lower_bufsiz);
+ lower_buf, rc);
out:
kfree(lower_buf);
return rc;
}
-static int
-ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- char *kbuf;
- size_t kbufsiz, copied;
+ char *buf;
+ size_t len = PATH_MAX;
int rc;
- rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
+ rc = ecryptfs_readlink_lower(dentry, &buf, &len);
if (rc)
goto out;
- copied = min_t(size_t, bufsiz, kbufsiz);
- rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
- kfree(kbuf);
fsstack_copy_attr_atime(dentry->d_inode,
ecryptfs_dentry_to_lower(dentry)->d_inode);
-out:
- return rc;
-}
-
-static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- char *buf;
- int len = PAGE_SIZE, rc;
- mm_segment_t old_fs;
-
- /* Released in ecryptfs_put_link(); only release here on error */
- buf = kmalloc(len, GFP_KERNEL);
- if (!buf) {
- buf = ERR_PTR(-ENOMEM);
- goto out;
- }
- old_fs = get_fs();
- set_fs(get_ds());
- rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
- set_fs(old_fs);
- if (rc < 0) {
- kfree(buf);
- buf = ERR_PTR(rc);
- } else
- buf[rc] = '\0';
+ buf[len] = '\0';
out:
nd_set_link(nd, buf);
return NULL;
@@ -1153,7 +1123,7 @@ out:
}
const struct inode_operations ecryptfs_symlink_iops = {
- .readlink = ecryptfs_readlink,
+ .readlink = generic_readlink,
.follow_link = ecryptfs_follow_link,
.put_link = ecryptfs_put_link,
.permission = ecryptfs_permission,
diff --git a/fs/exec.c b/fs/exec.c
index 52c9e2ff6e6..a79786a8d2c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -280,10 +280,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
INIT_LIST_HEAD(&vma->anon_vma_chain);
- err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
- if (err)
- goto err;
-
err = insert_vm_struct(mm, vma);
if (err)
goto err;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b05acb79613..b0201ca6e9c 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -304,24 +304,23 @@ out:
/**
* export_encode_fh - default export_operations->encode_fh function
- * @dentry: the dentry to encode
+ * @inode: the object to encode
* @fh: where to store the file handle fragment
* @max_len: maximum length to store there
- * @connectable: whether to store parent information
+ * @parent: parent directory inode, if wanted
*
* This default encode_fh function assumes that the 32 inode number
* is suitable for locating an inode, and that the generation number
* can be used to check that it is still valid. It places them in the
* filehandle fragment where export_decode_fh expects to find them.
*/
-static int export_encode_fh(struct dentry *dentry, struct fid *fid,
- int *max_len, int connectable)
+static int export_encode_fh(struct inode *inode, struct fid *fid,
+ int *max_len, struct inode *parent)
{
- struct inode * inode = dentry->d_inode;
int len = *max_len;
int type = FILEID_INO32_GEN;
- if (connectable && (len < 4)) {
+ if (parent && (len < 4)) {
*max_len = 4;
return 255;
} else if (len < 2) {
@@ -332,14 +331,9 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
len = 2;
fid->i32.ino = inode->i_ino;
fid->i32.gen = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
-
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent->d_inode;
+ if (parent) {
fid->i32.parent_ino = parent->i_ino;
fid->i32.parent_gen = parent->i_generation;
- spin_unlock(&dentry->d_lock);
len = 4;
type = FILEID_INO32_GEN_PARENT;
}
@@ -352,11 +346,22 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
{
const struct export_operations *nop = dentry->d_sb->s_export_op;
int error;
+ struct dentry *p = NULL;
+ struct inode *inode = dentry->d_inode, *parent = NULL;
+ if (connectable && !S_ISDIR(inode->i_mode)) {
+ p = dget_parent(dentry);
+ /*
+ * note that while p might've ceased to be our parent already,
+ * it's still pinned by and still positive.
+ */
+ parent = p->d_inode;
+ }
if (nop->encode_fh)
- error = nop->encode_fh(dentry, fid->raw, max_len, connectable);
+ error = nop->encode_fh(inode, fid->raw, max_len, parent);
else
- error = export_encode_fh(dentry, fid, max_len, connectable);
+ error = export_encode_fh(inode, fid, max_len, parent);
+ dput(p);
return error;
}
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9ed1bb1f319..c22f17021b6 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -2,6 +2,8 @@ config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
select JBD2
select CRC16
+ select CRYPTO
+ select CRYPTO_CRC32C
help
This is the next generation of the ext3 filesystem.
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c45c41129a3..99b6324290d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -168,12 +168,14 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
ext4_free_group_clusters_set(sb, gdp, 0);
ext4_free_inodes_set(sb, gdp, 0);
ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8);
return;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -210,6 +212,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
sb->s_blocksize * 8, bh->b_data);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
}
/* Return the number of free blocks in a block group. It is used when
@@ -276,9 +281,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
}
static int ext4_valid_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- unsigned int block_group,
- struct buffer_head *bh)
+ struct ext4_group_desc *desc,
+ unsigned int block_group,
+ struct buffer_head *bh)
{
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
@@ -325,6 +330,23 @@ err_out:
block_group, bitmap_blk);
return 0;
}
+
+void ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ unsigned int block_group,
+ struct buffer_head *bh)
+{
+ if (buffer_verified(bh))
+ return;
+
+ ext4_lock_group(sb, block_group);
+ if (ext4_valid_block_bitmap(sb, desc, block_group, bh) &&
+ ext4_block_bitmap_csum_verify(sb, block_group, desc, bh,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8))
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+}
+
/**
* ext4_read_block_bitmap()
* @sb: super block
@@ -355,12 +377,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
if (bitmap_uptodate(bh))
- return bh;
+ goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
- return bh;
+ goto verify;
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -379,7 +401,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
*/
set_bitmap_uptodate(bh);
unlock_buffer(bh);
- return bh;
+ goto verify;
}
/*
* submit the buffer_head for reading
@@ -390,6 +412,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
get_bh(bh);
submit_bh(READ, bh);
return bh;
+verify:
+ ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ return bh;
}
/* Returns 0 on success, 1 on error */
@@ -412,7 +437,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
}
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
- ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ ext4_validate_block_bitmap(sb, desc, block_group, bh);
return 0;
}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index fa3af81ac56..b319721da26 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -29,3 +29,86 @@ unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
#endif /* EXT4FS_DEBUG */
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
+{
+ __u32 hi;
+ __u32 provided, calculated;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+ calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
+ hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
+ provided |= (hi << 16);
+ } else
+ calculated &= 0xFFFF;
+
+ return provided == calculated;
+}
+
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
+{
+ __u32 csum;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+ gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
+
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
+{
+ __u32 hi;
+ __u32 provided, calculated;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+ calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
+ hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
+ provided |= (hi << 16);
+ } else
+ calculated &= 0xFFFF;
+
+ if (provided == calculated)
+ return 1;
+
+ ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group);
+ return 0;
+}
+
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
+{
+ __u32 csum;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+ gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b8678620264..aa39e600d15 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -179,6 +179,18 @@ static int ext4_readdir(struct file *filp,
continue;
}
+ /* Check the checksum */
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(inode,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+ "at offset %llu",
+ (unsigned long long)filp->f_pos);
+ filp->f_pos += sb->s_blocksize - offset;
+ continue;
+ }
+ set_buffer_verified(bh);
+
revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c21b1de51af..cfc4e01b3c8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,7 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#include <crypto/hash.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -298,7 +299,9 @@ struct ext4_group_desc
__le16 bg_free_inodes_count_lo;/* Free inodes count */
__le16 bg_used_dirs_count_lo; /* Directories count */
__le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
- __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
+ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */
+ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
+ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
__le16 bg_itable_unused_lo; /* Unused inodes count */
__le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
__le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
@@ -308,9 +311,19 @@ struct ext4_group_desc
__le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
__le16 bg_used_dirs_count_hi; /* Directories count MSB */
__le16 bg_itable_unused_hi; /* Unused inodes count MSB */
- __u32 bg_reserved2[3];
+ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */
+ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
+ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
+ __u32 bg_reserved;
};
+#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \
+ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
+ sizeof(__le16))
+#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \
+ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
+ sizeof(__le16))
+
/*
* Structure of a flex block group info
*/
@@ -650,7 +663,8 @@ struct ext4_inode {
__le16 l_i_file_acl_high;
__le16 l_i_uid_high; /* these 2 fields */
__le16 l_i_gid_high; /* were reserved2[0] */
- __u32 l_i_reserved2;
+ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
+ __le16 l_i_reserved;
} linux2;
struct {
__le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */
@@ -666,7 +680,7 @@ struct ext4_inode {
} masix2;
} osd2; /* OS dependent 2 */
__le16 i_extra_isize;
- __le16 i_pad1;
+ __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */
__le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */
__le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */
__le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */
@@ -768,7 +782,7 @@ do { \
#define i_gid_low i_gid
#define i_uid_high osd2.linux2.l_i_uid_high
#define i_gid_high osd2.linux2.l_i_gid_high
-#define i_reserved2 osd2.linux2.l_i_reserved2
+#define i_checksum_lo osd2.linux2.l_i_checksum_lo
#elif defined(__GNU__)
@@ -908,6 +922,9 @@ struct ext4_inode_info {
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
+
+ /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
+ __u32 i_csum_seed;
};
/*
@@ -1001,6 +1018,9 @@ extern void ext4_set_bits(void *bm, int cur, int len);
#define EXT4_ERRORS_PANIC 3 /* Panic */
#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE
+/* Metadata checksum algorithm codes */
+#define EXT4_CRC32C_CHKSUM 1
+
/*
* Structure of the super block
*/
@@ -1087,7 +1107,7 @@ struct ext4_super_block {
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
- __u8 s_reserved_char_pad;
+ __u8 s_checksum_type; /* metadata checksum algorithm used */
__le16 s_reserved_pad;
__le64 s_kbytes_written; /* nr of lifetime kilobytes written */
__le32 s_snapshot_inum; /* Inode number of active snapshot */
@@ -1113,7 +1133,8 @@ struct ext4_super_block {
__le32 s_usr_quota_inum; /* inode for tracking user quota */
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
- __le32 s_reserved[109]; /* Padding to the end of the block */
+ __le32 s_reserved[108]; /* Padding to the end of the block */
+ __le32 s_checksum; /* crc32c(superblock) */
};
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1176,6 +1197,7 @@ struct ext4_sb_info {
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
struct completion s_kobj_unregister;
+ struct super_block *s_sb;
/* Journaling */
struct journal_s *s_journal;
@@ -1266,6 +1288,12 @@ struct ext4_sb_info {
/* record the last minlen when FITRIM is called. */
atomic_t s_last_trim_minblks;
+
+ /* Reference to checksum algorithm driver via cryptoapi */
+ struct crypto_shash *s_chksum_driver;
+
+ /* Precomputed FS UUID checksum for seeding other checksums */
+ __u32 s_csum_seed;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1414,6 +1442,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+/*
+ * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When
+ * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
+ * all other data structures' checksums. However, the METADATA_CSUM and
+ * GDT_CSUM bits are mutually exclusive.
+ */
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
@@ -1461,7 +1495,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)
/*
* Default values for user and/or group using reserved blocks
@@ -1527,6 +1562,18 @@ struct ext4_dir_entry_2 {
};
/*
+ * This is a bogus directory entry at the end of each leaf block that
+ * records checksums.
+ */
+struct ext4_dir_entry_tail {
+ __le32 det_reserved_zero1; /* Pretend to be unused */
+ __le16 det_rec_len; /* 12 */
+ __u8 det_reserved_zero2; /* Zero name length */
+ __u8 det_reserved_ft; /* 0xDE, fake file type */
+ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */
+};
+
+/*
* Ext4 directory file types. Only the low 3 bits are used. The
* other bits are reserved for now.
*/
@@ -1541,6 +1588,8 @@ struct ext4_dir_entry_2 {
#define EXT4_FT_MAX 8
+#define EXT4_FT_DIR_CSUM 0xDE
+
/*
* EXT4_DIR_PAD defines the directory entries boundaries
*
@@ -1609,6 +1658,25 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5
+static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
+ const void *address, unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[crypto_shash_descsize(sbi->s_chksum_driver)];
+ } desc;
+ int err;
+
+ desc.shash.tfm = sbi->s_chksum_driver;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+
+ return *(u32 *)desc.ctx;
+}
+
#ifdef __KERNEL__
/* hash info structure used by the directory hash */
@@ -1741,7 +1809,8 @@ struct mmp_struct {
__le16 mmp_check_interval;
__le16 mmp_pad1;
- __le32 mmp_pad2[227];
+ __le32 mmp_pad2[226];
+ __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */
};
/* arguments passed to the mmp thread */
@@ -1784,8 +1853,24 @@ struct mmpd_data {
/* bitmap.c */
extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
/* balloc.c */
+extern void ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ unsigned int block_group,
+ struct buffer_head *bh);
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1864,7 +1949,7 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
-extern int ext4_mb_init(struct super_block *, int);
+extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
@@ -1936,6 +2021,8 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
extern int ext4_ext_migrate(struct inode *);
/* namei.c */
+extern int ext4_dirent_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1950,6 +2037,10 @@ extern int ext4_group_extend(struct super_block *sb,
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
+extern int ext4_superblock_csum_verify(struct super_block *sb,
+ struct ext4_super_block *es);
+extern void ext4_superblock_csum_set(struct super_block *sb,
+ struct ext4_super_block *es);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
@@ -2025,10 +2116,17 @@ extern void ext4_used_dirs_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
- struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
+extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
+ struct ext4_group_desc *gdp);
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+ return EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+}
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
@@ -2225,6 +2323,9 @@ static inline void ext4_unlock_group(struct super_block *sb,
static inline void ext4_mark_super_dirty(struct super_block *sb)
{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ ext4_superblock_csum_set(sb, es);
if (EXT4_SB(sb)->s_journal == NULL)
sb->s_dirt =1;
}
@@ -2314,6 +2415,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
+extern int ext4_mmp_csum_verify(struct super_block *sb,
+ struct mmp_struct *mmp);
/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 0f58b86e3a0..cb1b2c91996 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -63,9 +63,22 @@
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
+ * For non-inode extent blocks, ext4_extent_tail
+ * follows the array.
*/
/*
+ * This is the extent tail on-disk structure.
+ * All other extent structures are 12 bytes long. It turns out that
+ * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
+ * covers all valid ext4 block sizes. Therefore, this tail structure can be
+ * crammed into the end of the block without having to rebalance the tree.
+ */
+struct ext4_extent_tail {
+ __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */
+};
+
+/*
* This is the extent on-disk structure.
* It's used at the bottom of the tree.
*/
@@ -101,6 +114,17 @@ struct ext4_extent_header {
#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
+#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
+ (sizeof(struct ext4_extent_header) + \
+ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
+
+static inline struct ext4_extent_tail *
+find_ext4_extent_tail(struct ext4_extent_header *eh)
+{
+ return (struct ext4_extent_tail *)(((void *)eh) +
+ EXT4_EXTENT_TAIL_OFFSET(eh));
+}
+
/*
* Array of ext4_ext_path contains path to some extent.
* Creation/lookup routines use it for traversal/splitting/etc.
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index aca17901758..90f7c2e84db 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -138,16 +138,23 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
}
int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb)
+ handle_t *handle, struct super_block *sb,
+ int now)
{
struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
int err = 0;
if (ext4_handle_valid(handle)) {
+ ext4_superblock_csum_set(sb,
+ (struct ext4_super_block *)bh->b_data);
err = jbd2_journal_dirty_metadata(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
+ } else if (now) {
+ ext4_superblock_csum_set(sb,
+ (struct ext4_super_block *)bh->b_data);
+ mark_buffer_dirty(bh);
} else
sb->s_dirt = 1;
return err;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 83b20fcf940..f440e8f1841 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -213,7 +213,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
struct buffer_head *bh);
int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb);
+ handle_t *handle, struct super_block *sb,
+ int now);
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
@@ -225,8 +226,10 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
+#define ext4_handle_dirty_super_now(handle, sb) \
+ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 1)
#define ext4_handle_dirty_super(handle, sb) \
- __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
+ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 0)
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index abcdeab67f5..91341ec6e06 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -52,6 +52,46 @@
#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+static __le32 ext4_extent_block_csum(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ EXT4_EXTENT_TAIL_OFFSET(eh));
+ return cpu_to_le32(csum);
+}
+
+static int ext4_extent_block_csum_verify(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent_tail *et;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ et = find_ext4_extent_tail(eh);
+ if (et->et_checksum != ext4_extent_block_csum(inode, eh))
+ return 0;
+ return 1;
+}
+
+static void ext4_extent_block_csum_set(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent_tail *et;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ et = find_ext4_extent_tail(eh);
+ et->et_checksum = ext4_extent_block_csum(inode, eh);
+}
+
static int ext4_split_extent(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
@@ -117,6 +157,7 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
{
int err;
if (path->p_bh) {
+ ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
@@ -391,6 +432,12 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid extent entries";
goto corrupted;
}
+ /* Verify checksum on non-root extent tree nodes */
+ if (ext_depth(inode) != depth &&
+ !ext4_extent_block_csum_verify(inode, eh)) {
+ error_msg = "extent tree corrupted";
+ goto corrupted;
+ }
return 0;
corrupted:
@@ -412,6 +459,26 @@ int ext4_ext_check_inode(struct inode *inode)
return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
}
+static int __ext4_ext_check_block(const char *function, unsigned int line,
+ struct inode *inode,
+ struct ext4_extent_header *eh,
+ int depth,
+ struct buffer_head *bh)
+{
+ int ret;
+
+ if (buffer_verified(bh))
+ return 0;
+ ret = ext4_ext_check(inode, eh, depth);
+ if (ret)
+ return ret;
+ set_buffer_verified(bh);
+ return ret;
+}
+
+#define ext4_ext_check_block(inode, eh, depth, bh) \
+ __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
+
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
@@ -536,7 +603,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
}
path->p_idx = l - 1;
- ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
+ ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
ext4_idx_pblock(path->p_idx));
#ifdef CHECK_BINSEARCH
@@ -668,8 +735,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
i = depth;
/* walk through the tree */
while (i) {
- int need_to_validate = 0;
-
ext_debug("depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -688,8 +753,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
put_bh(bh);
goto err;
}
- /* validate the extent entries */
- need_to_validate = 1;
}
eh = ext_block_hdr(bh);
ppos++;
@@ -703,7 +766,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_hdr = eh;
i--;
- if (need_to_validate && ext4_ext_check(inode, eh, i))
+ if (ext4_ext_check_block(inode, eh, i, bh))
goto err;
}
@@ -914,6 +977,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
le16_add_cpu(&neh->eh_entries, m);
}
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -992,6 +1056,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent_idx) * m);
le16_add_cpu(&neh->eh_entries, m);
}
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1089,6 +1154,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
else
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1344,7 +1410,8 @@ got_index:
return -EIO;
eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check_block(inode, eh,
+ path->p_depth - depth, bh)) {
put_bh(bh);
return -EIO;
}
@@ -1357,7 +1424,7 @@ got_index:
if (bh == NULL)
return -EIO;
eh = ext_block_hdr(bh);
- if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
put_bh(bh);
return -EIO;
}
@@ -2644,8 +2711,8 @@ cont:
err = -EIO;
break;
}
- if (ext4_ext_check(inode, ext_block_hdr(bh),
- depth - i - 1)) {
+ if (ext4_ext_check_block(inode, ext_block_hdr(bh),
+ depth - i - 1, bh)) {
err = -EIO;
break;
}
@@ -4722,8 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
/* Now release the pages */
if (last_page_offset > first_page_offset) {
- truncate_inode_pages_range(mapping, first_page_offset,
- last_page_offset-1);
+ truncate_pagecache_range(inode, first_page_offset,
+ last_page_offset - 1);
}
/* finish any pending end_io work */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cb70f1812a7..8c7642a0005 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
int unaligned_aio = 0;
- int ret;
+ ssize_t ret;
/*
* If we have encountered a bitmap-format file, the size limit
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9f9acac6c43..d48e8b14928 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,24 +70,27 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
ext4_free_group_clusters_set(sb, gdp, 0);
ext4_free_inodes_set(sb, gdp, 0);
ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
+ ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
return 0;
}
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
bh->b_data);
+ ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
return EXT4_INODES_PER_GROUP(sb);
}
@@ -128,12 +131,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
return NULL;
}
if (bitmap_uptodate(bh))
- return bh;
+ goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
- return bh;
+ goto verify;
}
ext4_lock_group(sb, block_group);
@@ -141,6 +144,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
return bh;
@@ -154,7 +158,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
*/
set_bitmap_uptodate(bh);
unlock_buffer(bh);
- return bh;
+ goto verify;
}
/*
* submit the buffer_head for reading
@@ -171,6 +175,20 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
block_group, bitmap_blk);
return NULL;
}
+
+verify:
+ ext4_lock_group(sb, block_group);
+ if (!buffer_verified(bh) &&
+ !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8)) {
+ ext4_unlock_group(sb, block_group);
+ put_bh(bh);
+ ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+ "inode_bitmap = %llu", block_group, bitmap_blk);
+ return NULL;
+ }
+ ext4_unlock_group(sb, block_group);
+ set_buffer_verified(bh);
return bh;
}
@@ -276,7 +294,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ext4_used_dirs_set(sb, gdp, count);
percpu_counter_dec(&sbi->s_dirs_counter);
}
- gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
percpu_counter_inc(&sbi->s_freeinodes_counter);
@@ -488,10 +508,12 @@ fallback_retry:
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
- grp_free = ext4_free_inodes_count(sb, desc);
- if (desc && grp_free && grp_free >= avefreei) {
- *group = grp;
- return 0;
+ if (desc) {
+ grp_free = ext4_free_inodes_count(sb, desc);
+ if (grp_free && grp_free >= avefreei) {
+ *group = grp;
+ return 0;
+ }
}
}
@@ -709,7 +731,7 @@ repeat_in_this_group:
got:
/* We may have to initialize the block bitmap if it isn't already */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+ if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
struct buffer_head *block_bitmap_bh;
@@ -731,8 +753,11 @@ got:
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_group_clusters_set(sb, gdp,
ext4_free_clusters_after_init(sb, group, gdp));
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
- gdp);
+ ext4_block_bitmap_csum_set(sb, group, gdp,
+ block_bitmap_bh,
+ EXT4_BLOCKS_PER_GROUP(sb) /
+ 8);
+ ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
@@ -751,7 +776,7 @@ got:
goto fail;
/* Update the relevant bg descriptor fields */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+ if (ext4_has_group_desc_csum(sb)) {
int free;
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -772,7 +797,10 @@ got:
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - ino));
up_read(&grp->alloc_sem);
+ } else {
+ ext4_lock_group(sb, group);
}
+
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (S_ISDIR(mode)) {
ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
@@ -782,10 +810,12 @@ got:
atomic_inc(&sbi->s_flex_groups[f].used_dirs);
}
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
- ext4_unlock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb)) {
+ ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, group, gdp);
}
+ ext4_unlock_group(sb, group);
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
@@ -850,6 +880,19 @@ got:
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ __u32 csum;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = cpu_to_le32(inode->i_generation);
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ext4_set_inode_state(inode, EXT4_STATE_NEW);
@@ -1140,7 +1183,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
skip_zeroout:
ext4_lock_group(sb, group);
gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ ext4_group_desc_csum_set(sb, group, gdp);
ext4_unlock_group(sb, group);
BUFFER_TRACE(group_desc_bh,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 07eaf565fdc..02bc8cbe728 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,6 +47,73 @@
#define MPAGE_DA_EXTENT_TAIL 0x01
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u16 csum_lo;
+ __u16 csum_hi = 0;
+ __u32 csum;
+
+ csum_lo = raw->i_checksum_lo;
+ raw->i_checksum_lo = 0;
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+ csum_hi = raw->i_checksum_hi;
+ raw->i_checksum_hi = 0;
+ }
+
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+ EXT4_INODE_SIZE(inode->i_sb));
+
+ raw->i_checksum_lo = csum_lo;
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = csum_hi;
+
+ return csum;
+}
+
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
+{
+ __u32 provided, calculated;
+
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ provided = le16_to_cpu(raw->i_checksum_lo);
+ calculated = ext4_inode_csum(inode, raw, ei);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+ else
+ calculated &= 0xFFFF;
+
+ return provided == calculated;
+}
+
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
+{
+ __u32 csum;
+
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ csum = ext4_inode_csum(inode, raw, ei);
+ raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum >> 16);
+}
+
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
@@ -3517,8 +3584,7 @@ make_io:
b = table;
end = b + EXT4_SB(sb)->s_inode_readahead_blks;
num = EXT4_INODES_PER_GROUP(sb);
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (ext4_has_group_desc_csum(sb))
num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
@@ -3646,6 +3712,39 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ ret = -EIO;
+ goto bad_inode;
+ }
+ } else
+ ei->i_extra_isize = 0;
+
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = raw_inode->i_generation;
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
+ if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+ EXT4_ERROR_INODE(inode, "checksum invalid");
+ ret = -EIO;
+ goto bad_inode;
+ }
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
@@ -3725,12 +3824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT4_INODE_SIZE(inode->i_sb)) {
- ret = -EIO;
- goto bad_inode;
- }
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
ei->i_extra_isize = sizeof(struct ext4_inode) -
@@ -3742,8 +3835,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
}
- } else
- ei->i_extra_isize = 0;
+ }
EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
@@ -3942,7 +4034,7 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_SET_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
ext4_handle_sync(handle);
- err = ext4_handle_dirty_super(handle, sb);
+ err = ext4_handle_dirty_super_now(handle, sb);
}
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -3969,6 +4061,8 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
+ ext4_inode_csum_set(inode, raw_inode, ei);
+
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
@@ -4213,7 +4307,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
* will return the blocks that include the delayed allocation
* blocks for this file.
*/
- delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+ EXT4_I(inode)->i_reserved_data_blocks);
stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
return 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6eee25591b8..8ad112ae0ad 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
handle_t *handle = NULL;
int err, migrate = 0;
struct ext4_iloc iloc;
- unsigned int oldflags;
+ unsigned int oldflags, mask, i;
unsigned int jflag;
if (!inode_owner_or_capable(inode))
@@ -115,8 +115,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (err)
goto flags_err;
- flags = flags & EXT4_FL_USER_MODIFIABLE;
- flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
+ for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+ if (!(mask & EXT4_FL_USER_MODIFIABLE))
+ continue;
+ if (mask & flags)
+ ext4_set_inode_flag(inode, i);
+ else
+ ext4_clear_inode_flag(inode, i);
+ }
ei->i_flags = flags;
ext4_set_inode_flags(inode);
@@ -152,6 +158,13 @@ flags_out:
if (!inode_owner_or_capable(inode))
return -EPERM;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ ext4_warning(sb, "Setting inode version is not "
+ "supported with metadata_csum enabled.");
+ return -ENOTTY;
+ }
+
err = mnt_want_write_file(filp);
if (err)
return err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 99ab428bcfa..1cd6994fc44 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -788,7 +788,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
int first_block;
struct super_block *sb;
struct buffer_head *bhs;
- struct buffer_head **bh;
+ struct buffer_head **bh = NULL;
struct inode *inode;
char *data;
char *bitmap;
@@ -2375,7 +2375,7 @@ static int ext4_groupinfo_create_slab(size_t size)
return 0;
}
-int ext4_mb_init(struct super_block *sb, int needs_recovery)
+int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
@@ -2517,6 +2517,9 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
+
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2564,8 +2567,6 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
- if (sbi->s_proc)
- remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
@@ -2797,7 +2798,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
ext4_free_group_clusters_set(sb, gdp, len);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+ ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
@@ -3071,13 +3074,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
- int len;
-
- if (pa && pa->pa_type == MB_INODE_PA) {
- len = ac->ac_b_ex.fe_len;
- pa->pa_free += len;
- }
+ if (pa && pa->pa_type == MB_INODE_PA)
+ pa->pa_free += ac->ac_b_ex.fe_len;
}
/*
@@ -4636,6 +4635,7 @@ do_more:
*/
new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
if (!new_entry) {
+ ext4_mb_unload_buddy(&e4b);
err = -ENOMEM;
goto error_return;
}
@@ -4659,7 +4659,9 @@ do_more:
ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
ext4_free_group_clusters_set(sb, gdp, ret);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
@@ -4803,7 +4805,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
mb_free_blocks(NULL, &e4b, bit, count);
blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
ext4_free_group_clusters_set(sb, desc, blk_free_count);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+ ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
EXT4_B2C(sbi, blocks_freed));
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index ed6548d8916..f99a1311e84 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -6,12 +6,45 @@
#include "ext4.h"
+/* Checksumming functions */
+static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int offset = offsetof(struct mmp_struct, mmp_checksum);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+
+ return cpu_to_le32(csum);
+}
+
+int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
+}
+
+void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
+}
+
/*
* Write the MMP block using WRITE_SYNC to try to get the block on-disk
* faster.
*/
-static int write_mmp_block(struct buffer_head *bh)
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
+ struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+
+ ext4_mmp_csum_set(sb, mmp);
mark_buffer_dirty(bh);
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
@@ -59,7 +92,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
mmp = (struct mmp_struct *)((*bh)->b_data);
- if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
+ !ext4_mmp_csum_verify(sb, mmp))
return -EINVAL;
return 0;
@@ -120,7 +154,7 @@ static int kmmpd(void *data)
mmp->mmp_time = cpu_to_le64(get_seconds());
last_update_time = jiffies;
- retval = write_mmp_block(bh);
+ retval = write_mmp_block(sb, bh);
/*
* Don't spew too many error messages. Print one every
* (s_mmp_update_interval * 60) seconds.
@@ -200,7 +234,7 @@ static int kmmpd(void *data)
mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
mmp->mmp_time = cpu_to_le64(get_seconds());
- retval = write_mmp_block(bh);
+ retval = write_mmp_block(sb, bh);
failed:
kfree(data);
@@ -299,7 +333,7 @@ skip:
seq = mmp_new_seq();
mmp->mmp_seq = cpu_to_le32(seq);
- retval = write_mmp_block(bh);
+ retval = write_mmp_block(sb, bh);
if (retval)
goto failed;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e2a3f4b0ff7..5845cd97bf8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -145,6 +145,14 @@ struct dx_map_entry
u16 size;
};
+/*
+ * This goes at the end of each htree block.
+ */
+struct dx_tail {
+ u32 dt_reserved;
+ __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
+};
+
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
@@ -180,6 +188,230 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
+/* checksumming functions */
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+ ((blocksize) - \
+ sizeof(struct ext4_dir_entry_tail))))
+
+static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize)
+{
+ memset(t, 0, sizeof(struct ext4_dir_entry_tail));
+ t->det_rec_len = ext4_rec_len_to_disk(
+ sizeof(struct ext4_dir_entry_tail), blocksize);
+ t->det_reserved_ft = EXT4_FT_DIR_CSUM;
+}
+
+/* Walk through a dirent block to find a checksum "dirent" at the tail */
+static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
+ struct ext4_dir_entry *de)
+{
+ struct ext4_dir_entry_tail *t;
+
+#ifdef PARANOID
+ struct ext4_dir_entry *d, *top;
+
+ d = de;
+ top = (struct ext4_dir_entry *)(((void *)de) +
+ (EXT4_BLOCK_SIZE(inode->i_sb) -
+ sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && d->rec_len)
+ d = (struct ext4_dir_entry *)(((void *)d) +
+ le16_to_cpu(d->rec_len));
+
+ if (d != top)
+ return NULL;
+
+ t = (struct ext4_dir_entry_tail *)d;
+#else
+ t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+#endif
+
+ if (t->det_reserved_zero1 ||
+ le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ t->det_reserved_zero2 ||
+ t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+ return NULL;
+
+ return t;
+}
+
+static __le32 ext4_dirent_csum(struct inode *inode,
+ struct ext4_dir_entry *dirent, int size)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ return cpu_to_le32(csum);
+}
+
+int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+ struct ext4_dir_entry_tail *t;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ t = get_dirent_tail(inode, dirent);
+ if (!t) {
+ EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
+ "leaf for checksum. Please run e2fsck -D.");
+ return 0;
+ }
+
+ if (t->det_checksum != ext4_dirent_csum(inode, dirent,
+ (void *)t - (void *)dirent))
+ return 0;
+
+ return 1;
+}
+
+static void ext4_dirent_csum_set(struct inode *inode,
+ struct ext4_dir_entry *dirent)
+{
+ struct ext4_dir_entry_tail *t;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ t = get_dirent_tail(inode, dirent);
+ if (!t) {
+ EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
+ "leaf for checksum. Please run e2fsck -D.");
+ return;
+ }
+
+ t->det_checksum = ext4_dirent_csum(inode, dirent,
+ (void *)t - (void *)dirent);
+}
+
+static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
+ struct ext4_dir_entry *dirent,
+ int *offset)
+{
+ struct ext4_dir_entry *dp;
+ struct dx_root_info *root;
+ int count_offset;
+
+ if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ count_offset = 8;
+ else if (le16_to_cpu(dirent->rec_len) == 12) {
+ dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
+ if (le16_to_cpu(dp->rec_len) !=
+ EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ return NULL;
+ root = (struct dx_root_info *)(((void *)dp + 12));
+ if (root->reserved_zero ||
+ root->info_length != sizeof(struct dx_root_info))
+ return NULL;
+ count_offset = 32;
+ } else
+ return NULL;
+
+ if (offset)
+ *offset = count_offset;
+ return (struct dx_countlimit *)(((void *)dirent) + count_offset);
+}
+
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+ int count_offset, int count, struct dx_tail *t)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __u32 csum, old_csum;
+ int size;
+
+ size = count_offset + (count * sizeof(struct dx_entry));
+ old_csum = t->dt_checksum;
+ t->dt_checksum = 0;
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
+ t->dt_checksum = old_csum;
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent)
+{
+ struct dx_countlimit *c;
+ struct dx_tail *t;
+ int count_offset, limit, count;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return 1;
+ }
+ limit = le16_to_cpu(c->limit);
+ count = le16_to_cpu(c->count);
+ if (count_offset + (limit * sizeof(struct dx_entry)) >
+ EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+ EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
+ "tree checksum found. Run e2fsck -D.");
+ return 1;
+ }
+ t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+ if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
+ count, t))
+ return 0;
+ return 1;
+}
+
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+ struct dx_countlimit *c;
+ struct dx_tail *t;
+ int count_offset, limit, count;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return;
+ }
+ limit = le16_to_cpu(c->limit);
+ count = le16_to_cpu(c->count);
+ if (count_offset + (limit * sizeof(struct dx_entry)) >
+ EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+ EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
+ "tree checksum. Run e2fsck -D.");
+ return;
+ }
+ t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+ t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
+}
+
+static inline int ext4_handle_dirty_dx_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
/*
* p is at least 6 bytes before the end of page
*/
@@ -239,12 +471,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit(struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -390,6 +630,15 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
goto fail;
}
+ if (!buffer_verified(bh) &&
+ !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
+ ext4_warning(dir->i_sb, "Root failed checksum");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+ set_buffer_verified(bh);
+
entries = (struct dx_entry *) (((char *)&root->info) +
root->info.info_length);
@@ -450,6 +699,17 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
goto fail2;
at = entries = ((struct dx_node *) bh->b_data)->entries;
+
+ if (!buffer_verified(bh) &&
+ !ext4_dx_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ ext4_warning(dir->i_sb, "Node failed checksum");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+ set_buffer_verified(bh);
+
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext4_warning(dir->i_sb,
"dx entry: limit != node limit");
@@ -549,6 +809,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
0, &err)))
return err; /* Failure */
+
+ if (!buffer_verified(bh) &&
+ !ext4_dx_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ ext4_warning(dir->i_sb, "Node failed checksum");
+ return -EIO;
+ }
+ set_buffer_verified(bh);
+
p++;
brelse(p->bh);
p->bh = bh;
@@ -577,6 +846,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
return err;
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
+ return -EIO;
+ set_buffer_verified(bh);
+
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
@@ -936,6 +1210,15 @@ restart:
brelse(bh);
goto next;
}
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_INODE(dir, "checksumming directory "
+ "block %lu", (unsigned long)block);
+ brelse(bh);
+ goto next;
+ }
+ set_buffer_verified(bh);
i = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
@@ -987,6 +1270,16 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
goto errout;
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_INODE(dir, "checksumming directory "
+ "block %lu", (unsigned long)block);
+ brelse(bh);
+ *err = -EIO;
+ goto errout;
+ }
+ set_buffer_verified(bh);
retval = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
@@ -1037,6 +1330,12 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
+ if (unlikely(ino == dir->i_ino)) {
+ EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
+ dentry->d_name.len,
+ dentry->d_name.name);
+ return ERR_PTR(-EIO);
+ }
inode = ext4_iget(dir->i_sb, ino);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
@@ -1156,8 +1455,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
char *data1 = (*bh)->b_data, *data2;
unsigned split, move, size;
struct ext4_dir_entry_2 *de = NULL, *de2;
+ struct ext4_dir_entry_tail *t;
+ int csum_size = 0;
int err = 0, i;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
bh2 = ext4_append (handle, dir, &newblock, &err);
if (!(bh2)) {
brelse(*bh);
@@ -1204,10 +1509,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* Fancy dance to stay within two buffers */
de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
de = dx_pack_dirents(data1, blocksize);
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+ (char *) de,
blocksize);
- de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+ de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+ (char *) de2,
blocksize);
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(data2, blocksize);
+ initialize_dirent_tail(t, blocksize);
+
+ t = EXT4_DIRENT_TAIL(data1, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
@@ -1218,10 +1533,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
de = de2;
}
dx_insert_block(frame, hash2 + continued, newblock);
- err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
if (err)
goto journal_error;
- err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
+ err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
brelse(bh2);
@@ -1258,11 +1573,16 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
unsigned short reclen;
int nlen, rlen, err;
char *top;
+ int csum_size = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + blocksize - reclen;
+ top = bh->b_data + (blocksize - csum_size) - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
return -EIO;
@@ -1295,11 +1615,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
de = de1;
}
de->file_type = EXT4_FT_UNKNOWN;
- if (inode) {
- de->inode = cpu_to_le32(inode->i_ino);
- ext4_set_de_type(dir->i_sb, de, inode->i_mode);
- } else
- de->inode = 0;
+ de->inode = cpu_to_le32(inode->i_ino);
+ ext4_set_de_type(dir->i_sb, de, inode->i_mode);
de->name_len = namelen;
memcpy(de->name, name, namelen);
/*
@@ -1318,7 +1635,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, bh);
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
return 0;
@@ -1339,6 +1656,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct dx_frame frames[2], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
+ struct ext4_dir_entry_tail *t;
char *data1, *top;
unsigned len;
int retval;
@@ -1346,6 +1664,11 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct dx_hash_info hinfo;
ext4_lblk_t block;
struct fake_dirent *fde;
+ int csum_size = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
@@ -1366,7 +1689,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
brelse(bh);
return -EIO;
}
- len = ((char *) root) + blocksize - (char *) de;
+ len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
bh2 = ext4_append(handle, dir, &block, &retval);
@@ -1382,8 +1705,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
top = data1 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
de = de2;
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+ (char *) de,
blocksize);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(data1, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
@@ -1408,8 +1738,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->bh = bh;
bh = bh2;
- ext4_handle_dirty_metadata(handle, dir, frame->bh);
- ext4_handle_dirty_metadata(handle, dir, bh);
+ ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+ ext4_handle_dirty_dirent_node(handle, dir, bh);
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
if (!de) {
@@ -1445,11 +1775,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *dir = dentry->d_parent->d_inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
struct super_block *sb;
int retval;
int dx_fallback=0;
unsigned blocksize;
ext4_lblk_t block, blocks;
+ int csum_size = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
blocksize = sb->s_blocksize;
@@ -1468,6 +1804,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
bh = ext4_bread(handle, dir, block, 0, &retval);
if(!bh)
return retval;
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data))
+ return -EIO;
+ set_buffer_verified(bh);
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
if (retval != -ENOSPC) {
brelse(bh);
@@ -1484,7 +1825,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
- de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
+ de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
if (retval == 0)
@@ -1516,6 +1863,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
goto cleanup;
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
+ goto journal_error;
+ set_buffer_verified(bh);
+
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
@@ -1583,7 +1935,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
dxtrace(dx_show_index("node", frames[1].entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
- err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
@@ -1609,7 +1961,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
}
- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+ err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
if (err) {
ext4_std_error(inode->i_sb, err);
goto cleanup;
@@ -1641,12 +1993,17 @@ static int ext4_delete_entry(handle_t *handle,
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
+ int csum_size = 0;
int i, err;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
i = 0;
pde = NULL;
de = (struct ext4_dir_entry_2 *) bh->b_data;
- while (i < bh->b_size) {
+ while (i < bh->b_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh, i))
return -EIO;
if (de == de_del) {
@@ -1667,7 +2024,7 @@ static int ext4_delete_entry(handle_t *handle,
de->inode = 0;
dir->i_version++;
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, bh);
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
if (unlikely(err)) {
ext4_std_error(dir->i_sb, err);
return err;
@@ -1809,9 +2166,15 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
unsigned int blocksize = dir->i_sb->s_blocksize;
+ int csum_size = 0;
int err, retries = 0;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
@@ -1852,16 +2215,24 @@ retry:
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + EXT4_DIR_REC_LEN(1)),
blocksize);
de->name_len = 2;
strcpy(de->name, "..");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
set_nlink(inode, 2);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, dir_block);
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
if (err)
goto out_clear_inode;
+ set_buffer_verified(dir_block);
err = ext4_mark_inode_dirty(handle, inode);
if (!err)
err = ext4_add_entry(handle, dentry, inode);
@@ -1911,6 +2282,14 @@ static int empty_dir(struct inode *inode)
inode->i_ino);
return 1;
}
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(inode,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_INODE(inode, "checksum error reading directory "
+ "lblock 0");
+ return -EIO;
+ }
+ set_buffer_verified(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -1942,6 +2321,14 @@ static int empty_dir(struct inode *inode)
offset += sb->s_blocksize;
continue;
}
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(inode,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_INODE(inode, "checksum error "
+ "reading directory lblock 0");
+ return -EIO;
+ }
+ set_buffer_verified(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
@@ -2010,7 +2397,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
/* Insert this inode at the head of the on-disk orphan list... */
NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_super_now(handle, sb);
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
err = rc;
@@ -2083,7 +2470,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (err)
goto out_brelse;
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ err = ext4_handle_dirty_super_now(handle, inode->i_sb);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
@@ -2442,6 +2829,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
if (!dir_bh)
goto end_rename;
+ if (!buffer_verified(dir_bh) &&
+ !ext4_dirent_csum_verify(old_inode,
+ (struct ext4_dir_entry *)dir_bh->b_data))
+ goto end_rename;
+ set_buffer_verified(dir_bh);
if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
goto end_rename;
@@ -2472,7 +2864,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_current_time(new_dir);
ext4_mark_inode_dirty(handle, new_dir);
BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
- retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+ retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
if (unlikely(retval)) {
ext4_std_error(new_dir->i_sb, retval);
goto end_rename;
@@ -2526,7 +2918,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
- retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
+ retval = ext4_handle_dirty_dirent_node(handle, old_inode,
+ dir_bh);
if (retval) {
ext4_std_error(old_dir->i_sb, retval);
goto end_rename;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 59fa0be2725..7ea6cbb4412 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -161,6 +161,8 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
if (flex_gd == NULL)
goto out3;
+ if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+ goto out2;
flex_gd->count = flexbg_size;
flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
@@ -796,7 +798,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
ext4_kvfree(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_super_now(handle, sb);
if (err)
ext4_std_error(sb, err);
@@ -968,6 +970,8 @@ static void update_backups(struct super_block *sb,
goto exit_err;
}
+ ext4_superblock_csum_set(sb, (struct ext4_super_block *)data);
+
while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
struct buffer_head *bh;
@@ -1067,6 +1071,54 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
return err;
}
+static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
+{
+ struct buffer_head *bh = sb_getblk(sb, block);
+ if (!bh)
+ return NULL;
+
+ if (bitmap_uptodate(bh))
+ return bh;
+
+ lock_buffer(bh);
+ if (bh_submit_read(bh) < 0) {
+ unlock_buffer(bh);
+ brelse(bh);
+ return NULL;
+ }
+ unlock_buffer(bh);
+
+ return bh;
+}
+
+static int ext4_set_bitmap_checksums(struct super_block *sb,
+ ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct ext4_new_group_data *group_data)
+{
+ struct buffer_head *bh;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 0;
+
+ bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
+ if (!bh)
+ return -EIO;
+ ext4_inode_bitmap_csum_set(sb, group, gdp, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ brelse(bh);
+
+ bh = ext4_get_bitmap(sb, group_data->block_bitmap);
+ if (!bh)
+ return -EIO;
+ ext4_block_bitmap_csum_set(sb, group, gdp, bh,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ brelse(bh);
+
+ return 0;
+}
+
/*
* ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
*/
@@ -1093,18 +1145,24 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
*/
gdb_bh = sbi->s_group_desc[gdb_num];
/* Update group descriptor block for new group */
- gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+ gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
memset(gdp, 0, EXT4_DESC_SIZE(sb));
ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+ err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+ if (err) {
+ ext4_std_error(sb, err);
+ break;
+ }
+
ext4_inode_table_set(sb, gdp, group_data->inode_table);
ext4_free_group_clusters_set(sb, gdp,
EXT4_B2C(sbi, group_data->free_blocks_count));
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
gdp->bg_flags = cpu_to_le16(*bg_flags);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ ext4_group_desc_csum_set(sb, group, gdp);
err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
if (unlikely(err)) {
@@ -1343,17 +1401,14 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
(1 + ext4_bg_num_gdb(sb, group + i) +
le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
group_data[i].free_blocks_count = blocks_per_group - overhead;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (ext4_has_group_desc_csum(sb))
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
EXT4_BG_INODE_UNINIT;
else
flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
}
- if (last_group == n_group &&
- EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (last_group == n_group && ext4_has_group_desc_csum(sb))
/* We need to initialize block bitmap of last group. */
flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 35b5954489e..eb7aa3e4ef0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -112,6 +112,48 @@ static struct file_system_type ext3_fs_type = {
#define IS_EXT3_SB(sb) (0)
#endif
+static int ext4_verify_csum_type(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
+}
+
+static __le32 ext4_superblock_csum(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int offset = offsetof(struct ext4_super_block, s_checksum);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+
+ return cpu_to_le32(csum);
+}
+
+int ext4_superblock_csum_verify(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return es->s_checksum == ext4_superblock_csum(sb, es);
+}
+
+void ext4_superblock_csum_set(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
void *ext4_kvmalloc(size_t size, gfp_t flags)
{
void *ret;
@@ -497,6 +539,7 @@ void __ext4_error(struct super_block *sb, const char *function,
printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
+ save_error_info(sb, function, line);
ext4_handle_error(sb);
}
@@ -905,6 +948,8 @@ static void ext4_put_super(struct super_block *sb)
unlock_super(sb);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
}
@@ -1922,43 +1967,69 @@ failed:
return 0;
}
-__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
- struct ext4_group_desc *gdp)
+static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+ struct ext4_group_desc *gdp)
{
+ int offset;
__u16 crc = 0;
+ __le32 le_group = cpu_to_le32(block_group);
- if (sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
- int offset = offsetof(struct ext4_group_desc, bg_checksum);
- __le32 le_group = cpu_to_le32(block_group);
-
- crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
- crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
- crc = crc16(crc, (__u8 *)gdp, offset);
- offset += sizeof(gdp->bg_checksum); /* skip checksum */
- /* for checksum of struct ext4_group_desc do the rest...*/
- if ((sbi->s_es->s_feature_incompat &
- cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
- offset < le16_to_cpu(sbi->s_es->s_desc_size))
- crc = crc16(crc, (__u8 *)gdp + offset,
- le16_to_cpu(sbi->s_es->s_desc_size) -
- offset);
+ if ((sbi->s_es->s_feature_ro_compat &
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+ /* Use new metadata_csum algorithm */
+ __u16 old_csum;
+ __u32 csum32;
+
+ old_csum = gdp->bg_checksum;
+ gdp->bg_checksum = 0;
+ csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ sizeof(le_group));
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
+ sbi->s_desc_size);
+ gdp->bg_checksum = old_csum;
+
+ crc = csum32 & 0xFFFF;
+ goto out;
}
+ /* old crc16 code */
+ offset = offsetof(struct ext4_group_desc, bg_checksum);
+
+ crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+ crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+ crc = crc16(crc, (__u8 *)gdp, offset);
+ offset += sizeof(gdp->bg_checksum); /* skip checksum */
+ /* for checksum of struct ext4_group_desc do the rest...*/
+ if ((sbi->s_es->s_feature_incompat &
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+ offset < le16_to_cpu(sbi->s_es->s_desc_size))
+ crc = crc16(crc, (__u8 *)gdp + offset,
+ le16_to_cpu(sbi->s_es->s_desc_size) -
+ offset);
+
+out:
return cpu_to_le16(crc);
}
-int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
+int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
- if ((sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
- (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
+ block_group, gdp)))
return 0;
return 1;
}
+void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
+ struct ext4_group_desc *gdp)
+{
+ if (!ext4_has_group_desc_csum(sb))
+ return;
+ gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+}
+
/* Called at mount-time, super-block is locked */
static int ext4_check_descriptors(struct super_block *sb,
ext4_group_t *first_not_zeroed)
@@ -2013,7 +2084,7 @@ static int ext4_check_descriptors(struct super_block *sb,
return 0;
}
ext4_lock_group(sb, i);
- if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
+ if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Checksum for group %u failed (%u!=%u)",
i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
@@ -2417,6 +2488,23 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
return count;
}
+static ssize_t trigger_test_error(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ int len = count;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (len && buf[len-1] == '\n')
+ len--;
+
+ if (len)
+ ext4_error(sbi->s_sb, "%.*s", len, buf);
+ return count;
+}
+
#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -2447,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2461,6 +2550,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(trigger_fs_error),
NULL,
};
@@ -2957,6 +3047,44 @@ static void ext4_destroy_lazyinit_thread(void)
kthread_stop(ext4_lazyinit_task);
}
+static int set_journal_csum_feature_set(struct super_block *sb)
+{
+ int ret = 1;
+ int compat, incompat;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ /* journal checksum v2 */
+ compat = 0;
+ incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
+ } else {
+ /* journal checksum v1 */
+ compat = JBD2_FEATURE_COMPAT_CHECKSUM;
+ incompat = 0;
+ }
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ret = jbd2_journal_set_features(sbi->s_journal,
+ compat, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+ incompat);
+ } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+ ret = jbd2_journal_set_features(sbi->s_journal,
+ compat, 0,
+ incompat);
+ jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ } else {
+ jbd2_journal_clear_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+ JBD2_FEATURE_INCOMPAT_CSUM_V2);
+ }
+
+ return ret;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -2993,6 +3121,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto out_free_orig;
}
sb->s_fs_info = sbi;
+ sbi->s_sb = sb;
sbi->s_mount_opt = 0;
sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
@@ -3032,13 +3161,54 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* Note: s_es must be initialized as soon as possible because
* some ext4 macro-instructions depend on its value
*/
- es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *) (bh->b_data + offset);
sbi->s_es = es;
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT4_SUPER_MAGIC)
goto cantfind_ext4;
sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
+
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ silent = 1;
+ goto cantfind_ext4;
+ }
+
+ /* Load the checksum driver */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ ret = PTR_ERR(sbi->s_chksum_driver);
+ sbi->s_chksum_driver = NULL;
+ goto failed_mount;
+ }
+ }
+
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ silent = 1;
+ goto cantfind_ext4;
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sizeof(es->s_uuid));
+
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
set_opt(sb, INIT_INODE_TABLE);
@@ -3200,7 +3370,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"Can't read superblock on 2nd try");
goto failed_mount;
}
- es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *)(bh->b_data + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
ext4_msg(sb, KERN_ERR,
@@ -3392,6 +3562,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
ext4_msg(sb, KERN_ERR, "not enough memory");
+ ret = -ENOMEM;
goto failed_mount;
}
@@ -3449,6 +3620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
+ ret = err;
goto failed_mount3;
}
@@ -3506,26 +3678,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto no_journal;
}
- if (ext4_blocks_count(es) > 0xffffffffULL &&
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
goto failed_mount_wq;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
- jbd2_journal_clear_features(sbi->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else {
- jbd2_journal_clear_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto failed_mount_wq;
}
/* We have now updated the journal if required, so we can
@@ -3606,7 +3769,8 @@ no_journal:
goto failed_mount4;
}
- ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
+ if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+ sb->s_flags |= MS_RDONLY;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -3641,7 +3805,7 @@ no_journal:
}
ext4_ext_init(sb);
- err = ext4_mb_init(sb, needs_recovery);
+ err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
err);
@@ -3724,6 +3888,8 @@ failed_mount2:
brelse(sbi->s_group_desc[i]);
ext4_kvfree(sbi->s_group_desc);
failed_mount:
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
if (sbi->s_proc) {
remove_proc_entry("options", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -3847,7 +4013,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
- es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *) (bh->b_data + offset);
if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -4039,6 +4205,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
&EXT4_SB(sb)->s_freeinodes_counter));
sb->s_dirt = 0;
BUFFER_TRACE(sbh, "marking dirty");
+ ext4_superblock_csum_set(sb, es);
mark_buffer_dirty(sbh);
if (sync) {
error = sync_dirty_buffer(sbh);
@@ -4333,7 +4500,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
struct ext4_group_desc *gdp =
ext4_get_group_desc(sb, g, NULL);
- if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
+ if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
ext4_msg(sb, KERN_ERR,
"ext4_remount: Checksum for group %u failed (%u!=%u)",
g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e88748e55c0..e56c9ed7d6e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,6 +122,58 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
NULL
};
+static __le32 ext4_xattr_block_csum(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __u32 csum, old;
+
+ old = hdr->h_checksum;
+ hdr->h_checksum = 0;
+ if (le32_to_cpu(hdr->h_refcount) != 1) {
+ block_nr = cpu_to_le64(block_nr);
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
+ sizeof(block_nr));
+ } else
+ csum = ei->i_csum_seed;
+ csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
+ EXT4_BLOCK_SIZE(inode->i_sb));
+ hdr->h_checksum = old;
+ return cpu_to_le32(csum);
+}
+
+static int ext4_xattr_block_csum_verify(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
+ return 0;
+ return 1;
+}
+
+static void ext4_xattr_block_csum_set(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
+}
+
+static inline int ext4_handle_dirty_xattr_block(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh));
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
static inline const struct xattr_handler *
ext4_xattr_handler(int name_index)
{
@@ -156,12 +208,22 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
}
static inline int
-ext4_xattr_check_block(struct buffer_head *bh)
+ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
{
+ int error;
+
+ if (buffer_verified(bh))
+ return 0;
+
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
return -EIO;
- return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+ if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
+ return -EIO;
+ error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+ if (!error)
+ set_buffer_verified(bh);
+ return error;
}
static inline int
@@ -224,7 +286,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(bh)) {
+ if (ext4_xattr_check_block(inode, bh)) {
bad_block:
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
@@ -369,7 +431,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(bh)) {
+ if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EIO;
@@ -492,7 +554,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (ce)
mb_cache_entry_release(ce);
unlock_buffer(bh);
- error = ext4_handle_dirty_metadata(handle, inode, bh);
+ error = ext4_handle_dirty_xattr_block(handle, inode, bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
dquot_free_block(inode, 1);
@@ -662,7 +724,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
- if (ext4_xattr_check_block(bs->bh)) {
+ if (ext4_xattr_check_block(inode, bs->bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EIO;
@@ -725,9 +787,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (error == -EIO)
goto bad_block;
if (!error)
- error = ext4_handle_dirty_metadata(handle,
- inode,
- bs->bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode,
+ bs->bh);
if (error)
goto cleanup;
goto inserted;
@@ -796,9 +858,9 @@ inserted:
ea_bdebug(new_bh, "reusing; refcount now=%d",
le32_to_cpu(BHDR(new_bh)->h_refcount));
unlock_buffer(new_bh);
- error = ext4_handle_dirty_metadata(handle,
- inode,
- new_bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode,
+ new_bh);
if (error)
goto cleanup_dquot;
}
@@ -855,8 +917,8 @@ getblk_failed:
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
ext4_xattr_cache_insert(new_bh);
- error = ext4_handle_dirty_metadata(handle,
- inode, new_bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode, new_bh);
if (error)
goto cleanup;
}
@@ -1193,7 +1255,7 @@ retry:
error = -EIO;
if (!bh)
goto cleanup;
- if (ext4_xattr_check_block(bh)) {
+ if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EIO;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 25b7387ff18..91f31ca7d9a 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -27,7 +27,9 @@ struct ext4_xattr_header {
__le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */
- __u32 h_reserved[4]; /* zero right now */
+ __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
+ /* id = inum if refcount=1, blknum otherwise */
+ __u32 h_reserved[3]; /* zero right now */
};
struct ext4_xattr_ibody_header {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c2973ea5df9..a3d81ebf6d8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -735,10 +735,9 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
}
static int
-fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
+fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
{
int len = *lenp;
- struct inode *inode = de->d_inode;
u32 ipos_h, ipos_m, ipos_l;
if (len < 5) {
@@ -754,9 +753,9 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
fh[1] = inode->i_generation;
fh[2] = ipos_h;
fh[3] = ipos_m | MSDOS_I(inode)->i_logstart;
- spin_lock(&de->d_lock);
- fh[4] = ipos_l | MSDOS_I(de->d_parent->d_inode)->i_logstart;
- spin_unlock(&de->d_lock);
+ fh[4] = ipos_l;
+ if (parent)
+ fh[4] |= MSDOS_I(parent)->i_logstart;
return 3;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d078b75572a..81b70e665bf 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -442,28 +442,24 @@ static int check_fcntl_cmd(unsigned cmd)
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
struct file *filp;
+ int fput_needed;
long err = -EBADF;
- filp = fget_raw(fd);
+ filp = fget_raw_light(fd, &fput_needed);
if (!filp)
goto out;
if (unlikely(filp->f_mode & FMODE_PATH)) {
- if (!check_fcntl_cmd(cmd)) {
- fput(filp);
- goto out;
- }
+ if (!check_fcntl_cmd(cmd))
+ goto out1;
}
err = security_file_fcntl(filp, cmd, arg);
- if (err) {
- fput(filp);
- return err;
- }
+ if (!err)
+ err = do_fcntl(fd, cmd, arg, filp);
- err = do_fcntl(fd, cmd, arg, filp);
-
- fput(filp);
+out1:
+ fput_light(filp, fput_needed);
out:
return err;
}
@@ -473,26 +469,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
unsigned long, arg)
{
struct file * filp;
- long err;
+ long err = -EBADF;
+ int fput_needed;
- err = -EBADF;
- filp = fget_raw(fd);
+ filp = fget_raw_light(fd, &fput_needed);
if (!filp)
goto out;
if (unlikely(filp->f_mode & FMODE_PATH)) {
- if (!check_fcntl_cmd(cmd)) {
- fput(filp);
- goto out;
- }
+ if (!check_fcntl_cmd(cmd))
+ goto out1;
}
err = security_file_fcntl(filp, cmd, arg);
- if (err) {
- fput(filp);
- return err;
- }
- err = -EBADF;
+ if (err)
+ goto out1;
switch (cmd) {
case F_GETLK64:
@@ -507,7 +498,8 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
err = do_fcntl(fd, cmd, arg, filp);
break;
}
- fput(filp);
+out1:
+ fput_light(filp, fput_needed);
out:
return err;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index 70f2a0fd6ae..a305d9e2d1b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
-DECLARE_LGLOCK(files_lglock);
DEFINE_LGLOCK(files_lglock);
/* SLAB cache for file structures */
@@ -421,9 +420,9 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
*/
void file_sb_list_add(struct file *file, struct super_block *sb)
{
- lg_local_lock(files_lglock);
+ lg_local_lock(&files_lglock);
__file_sb_list_add(file, sb);
- lg_local_unlock(files_lglock);
+ lg_local_unlock(&files_lglock);
}
/**
@@ -436,9 +435,9 @@ void file_sb_list_add(struct file *file, struct super_block *sb)
void file_sb_list_del(struct file *file)
{
if (!list_empty(&file->f_u.fu_list)) {
- lg_local_lock_cpu(files_lglock, file_list_cpu(file));
+ lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
list_del_init(&file->f_u.fu_list);
- lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
+ lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
}
}
@@ -485,7 +484,7 @@ void mark_files_ro(struct super_block *sb)
struct file *f;
retry:
- lg_global_lock(files_lglock);
+ lg_global_lock(&files_lglock);
do_file_list_for_each_entry(sb, f) {
struct vfsmount *mnt;
if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -502,12 +501,12 @@ retry:
file_release_write(f);
mnt = mntget(f->f_path.mnt);
/* This can sleep, so we can't hold the spinlock. */
- lg_global_unlock(files_lglock);
+ lg_global_unlock(&files_lglock);
mnt_drop_write(mnt);
mntput(mnt);
goto retry;
} while_file_list_for_each_entry;
- lg_global_unlock(files_lglock);
+ lg_global_unlock(&files_lglock);
}
void __init files_init(unsigned long mempages)
@@ -525,6 +524,6 @@ void __init files_init(unsigned long mempages)
n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
files_defer_init();
- lg_lock_init(files_lglock);
+ lg_lock_init(&files_lglock, "files_lglock");
percpu_counter_init(&nr_files, 0);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 504e61b7fd7..9562109d3a8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -962,7 +962,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
- file_update_time(file);
+ err = file_update_time(file);
+ if (err)
+ goto out;
if (file->f_flags & O_DIRECT) {
written = generic_file_direct_write(iocb, iov, &nr_segs,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 56f6dcf3076..42678a33b7b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -627,12 +627,10 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
return ERR_PTR(err);
}
-static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
- int connectable)
+static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
- bool encode_parent = connectable && !S_ISDIR(inode->i_mode);
- int len = encode_parent ? 6 : 3;
+ int len = parent ? 6 : 3;
u64 nodeid;
u32 generation;
@@ -648,14 +646,9 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
fh[1] = (u32)(nodeid & 0xffffffff);
fh[2] = generation;
- if (encode_parent) {
- struct inode *parent;
-
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent->d_inode;
+ if (parent) {
nodeid = get_fuse_inode(parent)->nodeid;
generation = parent->i_generation;
- spin_unlock(&dentry->d_lock);
fh[3] = (u32)(nodeid >> 32);
fh[4] = (u32)(nodeid & 0xffffffff);
@@ -663,7 +656,7 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
}
*max_len = len;
- return encode_parent ? 0x82 : 0x81;
+ return parent ? 0x82 : 0x81;
}
static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 70ba891654f..e8ed6d4a618 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -28,15 +28,14 @@
#define GFS2_LARGE_FH_SIZE 8
#define GFS2_OLD_FH_SIZE 10
-static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
- int connectable)
+static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
+ struct inode *parent)
{
__be32 *fh = (__force __be32 *)p;
- struct inode *inode = dentry->d_inode;
struct super_block *sb = inode->i_sb;
struct gfs2_inode *ip = GFS2_I(inode);
- if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
+ if (parent && (*len < GFS2_LARGE_FH_SIZE)) {
*len = GFS2_LARGE_FH_SIZE;
return 255;
} else if (*len < GFS2_SMALL_FH_SIZE) {
@@ -50,14 +49,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
*len = GFS2_SMALL_FH_SIZE;
- if (!connectable || inode == sb->s_root->d_inode)
+ if (!parent || inode == sb->s_root->d_inode)
return *len;
- spin_lock(&dentry->d_lock);
- inode = dentry->d_parent->d_inode;
- ip = GFS2_I(inode);
- igrab(inode);
- spin_unlock(&dentry->d_lock);
+ ip = GFS2_I(parent);
fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
@@ -65,8 +60,6 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
*len = GFS2_LARGE_FH_SIZE;
- iput(inode);
-
return *len;
}
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index 7a5eb2c718c..cdb84a83806 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -16,9 +16,9 @@
static int chk_if_allocated(struct super_block *s, secno sec, char *msg)
{
struct quad_buffer_head qbh;
- u32 *bmp;
+ __le32 *bmp;
if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail;
- if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
+ if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec);
goto fail1;
}
@@ -62,7 +62,7 @@ int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg)
static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward)
{
struct quad_buffer_head qbh;
- unsigned *bmp;
+ __le32 *bmp;
unsigned bs = near & ~0x3fff;
unsigned nr = (near & 0x3fff) & ~(n - 1);
/*unsigned mnr;*/
@@ -236,7 +236,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
int hpfs_alloc_if_possible(struct super_block *s, secno sec)
{
struct quad_buffer_head qbh;
- u32 *bmp;
+ __le32 *bmp;
if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end;
if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) {
bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
@@ -254,7 +254,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
{
struct quad_buffer_head qbh;
- u32 *bmp;
+ __le32 *bmp;
struct hpfs_sb_info *sbi = hpfs_sb(s);
/*printk("2 - ");*/
if (!n) return;
@@ -299,7 +299,7 @@ int hpfs_check_free_dnodes(struct super_block *s, int n)
int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14;
int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff;
int i, j;
- u32 *bmp;
+ __le32 *bmp;
struct quad_buffer_head qbh;
if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
for (j = 0; j < 512; j++) {
@@ -351,7 +351,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
hpfs_free_sectors(s, dno, 4);
} else {
struct quad_buffer_head qbh;
- u32 *bmp;
+ __le32 *bmp;
unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4;
if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
return;
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 08b503e8ed2..4bae4a4a60b 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -20,7 +20,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode,
int c1, c2 = 0;
go_down:
if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1;
- if (btree->internal) {
+ if (bp_internal(btree)) {
for (i = 0; i < btree->n_used_nodes; i++)
if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) {
a = le32_to_cpu(btree->u.internal[i].down);
@@ -82,7 +82,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
brelse(bh);
return -1;
}
- if (btree->internal) {
+ if (bp_internal(btree)) {
a = le32_to_cpu(btree->u.internal[n].down);
btree->u.internal[n].file_secno = cpu_to_le32(-1);
mark_buffer_dirty(bh);
@@ -129,12 +129,12 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
}
if (a == node && fnod) {
anode->up = cpu_to_le32(node);
- anode->btree.fnode_parent = 1;
+ anode->btree.flags |= BP_fnode_parent;
anode->btree.n_used_nodes = btree->n_used_nodes;
anode->btree.first_free = btree->first_free;
anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes;
memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12);
- btree->internal = 1;
+ btree->flags |= BP_internal;
btree->n_free_nodes = 11;
btree->n_used_nodes = 1;
btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree);
@@ -184,7 +184,10 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
hpfs_free_sectors(s, ra, 1);
if ((anode = hpfs_map_anode(s, na, &bh))) {
anode->up = cpu_to_le32(up);
- anode->btree.fnode_parent = up == node && fnod;
+ if (up == node && fnod)
+ anode->btree.flags |= BP_fnode_parent;
+ else
+ anode->btree.flags &= ~BP_fnode_parent;
mark_buffer_dirty(bh);
brelse(bh);
}
@@ -198,7 +201,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) {
anode = new_anode;
/*anode->up = cpu_to_le32(up != -1 ? up : ra);*/
- anode->btree.internal = 1;
+ anode->btree.flags |= BP_internal;
anode->btree.n_used_nodes = 1;
anode->btree.n_free_nodes = 59;
anode->btree.first_free = cpu_to_le16(16);
@@ -215,7 +218,8 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
}
if ((anode = hpfs_map_anode(s, na, &bh))) {
anode->up = cpu_to_le32(node);
- if (fnod) anode->btree.fnode_parent = 1;
+ if (fnod)
+ anode->btree.flags |= BP_fnode_parent;
mark_buffer_dirty(bh);
brelse(bh);
}
@@ -234,18 +238,19 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
}
ranode->up = cpu_to_le32(node);
memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free));
- if (fnod) ranode->btree.fnode_parent = 1;
- ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes;
- if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
+ if (fnod)
+ ranode->btree.flags |= BP_fnode_parent;
+ ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes;
+ if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
struct anode *unode;
if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) {
unode->up = cpu_to_le32(ra);
- unode->btree.fnode_parent = 0;
+ unode->btree.flags &= ~BP_fnode_parent;
mark_buffer_dirty(bh1);
brelse(bh1);
}
}
- btree->internal = 1;
+ btree->flags |= BP_internal;
btree->n_free_nodes = fnod ? 10 : 58;
btree->n_used_nodes = 2;
btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree);
@@ -278,7 +283,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree)
int d1, d2;
go_down:
d2 = 0;
- while (btree1->internal) {
+ while (bp_internal(btree1)) {
ano = le32_to_cpu(btree1->u.internal[pos].down);
if (level) brelse(bh);
if (hpfs_sb(s)->sb_chk)
@@ -412,13 +417,13 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs)
btree->n_free_nodes = 8;
btree->n_used_nodes = 0;
btree->first_free = cpu_to_le16(8);
- btree->internal = 0;
+ btree->flags &= ~BP_internal;
mark_buffer_dirty(bh);
} else hpfs_free_sectors(s, f, 1);
brelse(bh);
return;
}
- while (btree->internal) {
+ while (bp_internal(btree)) {
nodes = btree->n_used_nodes + btree->n_free_nodes;
for (i = 0; i < btree->n_used_nodes; i++)
if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f;
@@ -479,13 +484,13 @@ void hpfs_remove_fnode(struct super_block *s, fnode_secno fno)
struct extended_attribute *ea;
struct extended_attribute *ea_end;
if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return;
- if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree);
+ if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree);
else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno));
ea_end = fnode_end_ea(fnode);
for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
- if (ea->indirect)
- hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea));
- hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l));
+ if (ea_indirect(ea))
+ hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
+ hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l));
brelse(bh);
hpfs_free_sectors(s, fno, 1);
}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2fa0089a02a..b8472f803f4 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -87,7 +87,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
ret = -EIOERROR;
goto out;
}
- if (!fno->dirflag) {
+ if (!fnode_is_dir(fno)) {
e = 1;
hpfs_error(inode->i_sb, "not a directory, fnode %08lx",
(unsigned long)inode->i_ino);
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 1e0e2ac30fd..3228c524ebe 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -153,7 +153,7 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
}
de->length = cpu_to_le16(36);
de->down = 1;
- *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr);
+ *(__le32 *)((char *)de + 32) = cpu_to_le32(ptr);
}
}
@@ -177,7 +177,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
memmove((char *)de + d_size, de, (char *)de_end - (char *)de);
memset(de, 0, d_size);
if (down_ptr) {
- *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
+ *(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
de->down = 1;
}
de->length = cpu_to_le16(d_size);
@@ -656,7 +656,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
del->down = 0;
d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4);
} else if (down)
- *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
+ *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
} else goto endm;
if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) {
printk("HPFS: out of memory for dtree balancing\n");
@@ -672,7 +672,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
de_prev->down = 1;
dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4);
}
- *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
+ *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
hpfs_mark_4buffers_dirty(&qbh);
hpfs_brelse4(&qbh);
for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4);
@@ -1015,7 +1015,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
kfree(name2);
return NULL;
}
- if (!upf->dirflag) {
+ if (!fnode_is_dir(upf)) {
brelse(bh);
hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up));
kfree(name2);
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index d8b84d113c8..bcaafcd2666 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -23,15 +23,15 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len)
return;
}
if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
- if (ea->indirect) {
+ if (ea_indirect(ea)) {
if (ea_valuelen(ea) != 8) {
- hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x",
+ hpfs_error(s, "ea_indirect(ea) set while ea->valuelen!=8, %s %08x, pos %08x",
ano ? "anode" : "sectors", a, pos);
return;
}
if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4))
return;
- hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea));
+ hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
}
pos += ea->namelen + ea_valuelen(ea) + 5;
}
@@ -81,7 +81,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
struct extended_attribute *ea_end = fnode_end_ea(fnode);
for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
if (!strcmp(ea->name, key)) {
- if (ea->indirect)
+ if (ea_indirect(ea))
goto indirect;
if (ea_valuelen(ea) >= size)
return -EINVAL;
@@ -91,7 +91,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
}
a = le32_to_cpu(fnode->ea_secno);
len = le32_to_cpu(fnode->ea_size_l);
- ano = fnode->ea_anode;
+ ano = fnode_in_anode(fnode);
pos = 0;
while (pos < len) {
ea = (struct extended_attribute *)ex;
@@ -101,10 +101,10 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
return -EIO;
}
if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO;
- if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+ if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
return -EIO;
if (!strcmp(ea->name, key)) {
- if (ea->indirect)
+ if (ea_indirect(ea))
goto indirect;
if (ea_valuelen(ea) >= size)
return -EINVAL;
@@ -119,7 +119,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
indirect:
if (ea_len(ea) >= size)
return -EINVAL;
- if (hpfs_ea_read(s, ea_sec(ea), ea->anode, 0, ea_len(ea), buf))
+ if (hpfs_ea_read(s, ea_sec(ea), ea_in_anode(ea), 0, ea_len(ea), buf))
return -EIO;
buf[ea_len(ea)] = 0;
return 0;
@@ -136,8 +136,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
struct extended_attribute *ea_end = fnode_end_ea(fnode);
for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
if (!strcmp(ea->name, key)) {
- if (ea->indirect)
- return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea));
+ if (ea_indirect(ea))
+ return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
printk("HPFS: out of memory for EA\n");
return NULL;
@@ -148,7 +148,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
}
a = le32_to_cpu(fnode->ea_secno);
len = le32_to_cpu(fnode->ea_size_l);
- ano = fnode->ea_anode;
+ ano = fnode_in_anode(fnode);
pos = 0;
while (pos < len) {
char ex[4 + 255 + 1 + 8];
@@ -159,11 +159,11 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
return NULL;
}
if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL;
- if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+ if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
return NULL;
if (!strcmp(ea->name, key)) {
- if (ea->indirect)
- return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea));
+ if (ea_indirect(ea))
+ return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
printk("HPFS: out of memory for EA\n");
return NULL;
@@ -199,9 +199,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
struct extended_attribute *ea_end = fnode_end_ea(fnode);
for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
if (!strcmp(ea->name, key)) {
- if (ea->indirect) {
+ if (ea_indirect(ea)) {
if (ea_len(ea) == size)
- set_indirect_ea(s, ea->anode, ea_sec(ea), data, size);
+ set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
} else if (ea_valuelen(ea) == size) {
memcpy(ea_data(ea), data, size);
}
@@ -209,7 +209,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
}
a = le32_to_cpu(fnode->ea_secno);
len = le32_to_cpu(fnode->ea_size_l);
- ano = fnode->ea_anode;
+ ano = fnode_in_anode(fnode);
pos = 0;
while (pos < len) {
char ex[4 + 255 + 1 + 8];
@@ -220,12 +220,12 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
return;
}
if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
- if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+ if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
return;
if (!strcmp(ea->name, key)) {
- if (ea->indirect) {
+ if (ea_indirect(ea)) {
if (ea_len(ea) == size)
- set_indirect_ea(s, ea->anode, ea_sec(ea), data, size);
+ set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
}
else {
if (ea_valuelen(ea) == size)
@@ -246,7 +246,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) {
hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x",
(unsigned long)inode->i_ino,
- le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
+ le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
return;
}
if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) &&
@@ -276,7 +276,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s));
fnode->ea_size_s = cpu_to_le16(0);
fnode->ea_secno = cpu_to_le32(n);
- fnode->ea_anode = cpu_to_le32(0);
+ fnode->flags &= ~FNODE_anode;
mark_buffer_dirty(bh);
brelse(bh);
}
@@ -288,9 +288,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
secno q = hpfs_alloc_sector(s, fno, 1, 0);
if (!q) goto bail;
fnode->ea_secno = cpu_to_le32(q);
- fnode->ea_anode = 0;
+ fnode->flags &= ~FNODE_anode;
len++;
- } else if (!fnode->ea_anode) {
+ } else if (!fnode_in_anode(fnode)) {
if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) {
len++;
} else {
@@ -310,7 +310,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
anode->u.external[0].length = cpu_to_le32(len);
mark_buffer_dirty(bh);
brelse(bh);
- fnode->ea_anode = 1;
+ fnode->flags |= FNODE_anode;
fnode->ea_secno = cpu_to_le32(a_s);*/
secno new_sec;
int i;
@@ -338,7 +338,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
len = (pos + 511) >> 9;
}
}
- if (fnode->ea_anode) {
+ if (fnode_in_anode(fnode)) {
if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno),
0, len) != -1) {
len++;
@@ -351,16 +351,16 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
h[1] = strlen(key);
h[2] = size & 0xff;
h[3] = size >> 8;
- if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
- if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
- if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
+ if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
+ if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
+ if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
fnode->ea_size_l = cpu_to_le32(pos);
ret:
hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size;
return;
bail:
if (le32_to_cpu(fnode->ea_secno))
- if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
+ if (fnode_in_anode(fnode)) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9));
else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0);
}
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index 8b0650aae32..cce025aff1b 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -51,11 +51,11 @@ struct hpfs_boot_block
u8 n_rootdir_entries[2];
u8 n_sectors_s[2];
u8 media_byte;
- u16 sectors_per_fat;
- u16 sectors_per_track;
- u16 heads_per_cyl;
- u32 n_hidden_sectors;
- u32 n_sectors_l; /* size of partition */
+ __le16 sectors_per_fat;
+ __le16 sectors_per_track;
+ __le16 heads_per_cyl;
+ __le32 n_hidden_sectors;
+ __le32 n_sectors_l; /* size of partition */
u8 drive_number;
u8 mbz;
u8 sig_28h; /* 28h */
@@ -63,7 +63,7 @@ struct hpfs_boot_block
u8 vol_label[11];
u8 sig_hpfs[8]; /* "HPFS " */
u8 pad[448];
- u16 magic; /* aa55 */
+ __le16 magic; /* aa55 */
};
@@ -75,28 +75,28 @@ struct hpfs_boot_block
struct hpfs_super_block
{
- u32 magic; /* f995 e849 */
- u32 magic1; /* fa53 e9c5, more magic? */
+ __le32 magic; /* f995 e849 */
+ __le32 magic1; /* fa53 e9c5, more magic? */
u8 version; /* version of a filesystem usually 2 */
u8 funcversion; /* functional version - oldest version
of filesystem that can understand
this disk */
- u16 zero; /* 0 */
- fnode_secno root; /* fnode of root directory */
- secno n_sectors; /* size of filesystem */
- u32 n_badblocks; /* number of bad blocks */
- secno bitmaps; /* pointers to free space bit maps */
- u32 zero1; /* 0 */
- secno badblocks; /* bad block list */
- u32 zero3; /* 0 */
- time32_t last_chkdsk; /* date last checked, 0 if never */
- time32_t last_optimize; /* date last optimized, 0 if never */
- secno n_dir_band; /* number of sectors in dir band */
- secno dir_band_start; /* first sector in dir band */
- secno dir_band_end; /* last sector in dir band */
- secno dir_band_bitmap; /* free space map, 1 dnode per bit */
+ __le16 zero; /* 0 */
+ __le32 root; /* fnode of root directory */
+ __le32 n_sectors; /* size of filesystem */
+ __le32 n_badblocks; /* number of bad blocks */
+ __le32 bitmaps; /* pointers to free space bit maps */
+ __le32 zero1; /* 0 */
+ __le32 badblocks; /* bad block list */
+ __le32 zero3; /* 0 */
+ __le32 last_chkdsk; /* date last checked, 0 if never */
+ __le32 last_optimize; /* date last optimized, 0 if never */
+ __le32 n_dir_band; /* number of sectors in dir band */
+ __le32 dir_band_start; /* first sector in dir band */
+ __le32 dir_band_end; /* last sector in dir band */
+ __le32 dir_band_bitmap; /* free space map, 1 dnode per bit */
u8 volume_name[32]; /* not used */
- secno user_id_table; /* 8 preallocated sectors - user id */
+ __le32 user_id_table; /* 8 preallocated sectors - user id */
u32 zero6[103]; /* 0 */
};
@@ -109,8 +109,8 @@ struct hpfs_super_block
struct hpfs_spare_block
{
- u32 magic; /* f991 1849 */
- u32 magic1; /* fa52 29c5, more magic? */
+ __le32 magic; /* f991 1849 */
+ __le32 magic1; /* fa52 29c5, more magic? */
#ifdef __LITTLE_ENDIAN
u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */
@@ -153,21 +153,21 @@ struct hpfs_spare_block
u8 mm_contlgulty;
u8 unused;
- secno hotfix_map; /* info about remapped bad sectors */
- u32 n_spares_used; /* number of hotfixes */
- u32 n_spares; /* number of spares in hotfix map */
- u32 n_dnode_spares_free; /* spare dnodes unused */
- u32 n_dnode_spares; /* length of spare_dnodes[] list,
+ __le32 hotfix_map; /* info about remapped bad sectors */
+ __le32 n_spares_used; /* number of hotfixes */
+ __le32 n_spares; /* number of spares in hotfix map */
+ __le32 n_dnode_spares_free; /* spare dnodes unused */
+ __le32 n_dnode_spares; /* length of spare_dnodes[] list,
follows in this block*/
- secno code_page_dir; /* code page directory block */
- u32 n_code_pages; /* number of code pages */
- u32 super_crc; /* on HPFS386 and LAN Server this is
+ __le32 code_page_dir; /* code page directory block */
+ __le32 n_code_pages; /* number of code pages */
+ __le32 super_crc; /* on HPFS386 and LAN Server this is
checksum of superblock, on normal
OS/2 unused */
- u32 spare_crc; /* on HPFS386 checksum of spareblock */
- u32 zero1[15]; /* unused */
- dnode_secno spare_dnodes[100]; /* emergency free dnode list */
- u32 zero2[1]; /* room for more? */
+ __le32 spare_crc; /* on HPFS386 checksum of spareblock */
+ __le32 zero1[15]; /* unused */
+ __le32 spare_dnodes[100]; /* emergency free dnode list */
+ __le32 zero2[1]; /* room for more? */
};
/* The bad block list is 4 sectors long. The first word must be zero,
@@ -202,18 +202,18 @@ struct hpfs_spare_block
struct code_page_directory
{
- u32 magic; /* 4945 21f7 */
- u32 n_code_pages; /* number of pointers following */
- u32 zero1[2];
+ __le32 magic; /* 4945 21f7 */
+ __le32 n_code_pages; /* number of pointers following */
+ __le32 zero1[2];
struct {
- u16 ix; /* index */
- u16 code_page_number; /* code page number */
- u32 bounds; /* matches corresponding word
+ __le16 ix; /* index */
+ __le16 code_page_number; /* code page number */
+ __le32 bounds; /* matches corresponding word
in data block */
- secno code_page_data; /* sector number of a code_page_data
+ __le32 code_page_data; /* sector number of a code_page_data
containing c.p. array */
- u16 index; /* index in c.p. array in that sector*/
- u16 unknown; /* some unknown value; usually 0;
+ __le16 index; /* index in c.p. array in that sector*/
+ __le16 unknown; /* some unknown value; usually 0;
2 in Japanese version */
} array[31]; /* unknown length */
};
@@ -224,19 +224,19 @@ struct code_page_directory
struct code_page_data
{
- u32 magic; /* 8945 21f7 */
- u32 n_used; /* # elements used in c_p_data[] */
- u32 bounds[3]; /* looks a bit like
+ __le32 magic; /* 8945 21f7 */
+ __le32 n_used; /* # elements used in c_p_data[] */
+ __le32 bounds[3]; /* looks a bit like
(beg1,end1), (beg2,end2)
one byte each */
- u16 offs[3]; /* offsets from start of sector
+ __le16 offs[3]; /* offsets from start of sector
to start of c_p_data[ix] */
struct {
- u16 ix; /* index */
- u16 code_page_number; /* code page number */
- u16 unknown; /* the same as in cp directory */
+ __le16 ix; /* index */
+ __le16 code_page_number; /* code page number */
+ __le16 unknown; /* the same as in cp directory */
u8 map[128]; /* upcase table for chars 80..ff */
- u16 zero2;
+ __le16 zero2;
} code_page[3];
u8 incognita[78];
};
@@ -278,8 +278,8 @@ struct code_page_data
#define DNODE_MAGIC 0x77e40aae
struct dnode {
- u32 magic; /* 77e4 0aae */
- u32 first_free; /* offset from start of dnode to
+ __le32 magic; /* 77e4 0aae */
+ __le32 first_free; /* offset from start of dnode to
first free dir entry */
#ifdef __LITTLE_ENDIAN
u8 root_dnode: 1; /* Is it root dnode? */
@@ -293,14 +293,14 @@ struct dnode {
u8 root_dnode: 1; /* Is it root dnode? */
#endif
u8 increment_me2[3];
- secno up; /* (root dnode) directory's fnode
+ __le32 up; /* (root dnode) directory's fnode
(nonroot) parent dnode */
- dnode_secno self; /* pointer to this dnode */
+ __le32 self; /* pointer to this dnode */
u8 dirent[2028]; /* one or more dirents */
};
struct hpfs_dirent {
- u16 length; /* offset to next dirent */
+ __le16 length; /* offset to next dirent */
#ifdef __LITTLE_ENDIAN
u8 first: 1; /* set on phony ^A^A (".") entry */
@@ -346,12 +346,12 @@ struct hpfs_dirent {
u8 read_only: 1; /* dos attrib */
#endif
- fnode_secno fnode; /* fnode giving allocation info */
- time32_t write_date; /* mtime */
- u32 file_size; /* file length, bytes */
- time32_t read_date; /* atime */
- time32_t creation_date; /* ctime */
- u32 ea_size; /* total EA length, bytes */
+ __le32 fnode; /* fnode giving allocation info */
+ __le32 write_date; /* mtime */
+ __le32 file_size; /* file length, bytes */
+ __le32 read_date; /* atime */
+ __le32 creation_date; /* ctime */
+ __le32 ea_size; /* total EA length, bytes */
u8 no_of_acls; /* number of ACL's (low 3 bits) */
u8 ix; /* code page index (of filename), see
struct code_page_data */
@@ -375,50 +375,36 @@ struct hpfs_dirent {
struct bplus_leaf_node
{
- u32 file_secno; /* first file sector in extent */
- u32 length; /* length, sectors */
- secno disk_secno; /* first corresponding disk sector */
+ __le32 file_secno; /* first file sector in extent */
+ __le32 length; /* length, sectors */
+ __le32 disk_secno; /* first corresponding disk sector */
};
struct bplus_internal_node
{
- u32 file_secno; /* subtree maps sectors < this */
- anode_secno down; /* pointer to subtree */
+ __le32 file_secno; /* subtree maps sectors < this */
+ __le32 down; /* pointer to subtree */
};
+enum {
+ BP_hbff = 1,
+ BP_fnode_parent = 0x20,
+ BP_binary_search = 0x40,
+ BP_internal = 0x80
+};
struct bplus_header
{
-#ifdef __LITTLE_ENDIAN
- u8 hbff: 1; /* high bit of first free entry offset */
- u8 flag1234: 4;
- u8 fnode_parent: 1; /* ? we're pointed to by an fnode,
- the data btree or some ea or the
- main ea bootage pointer ea_secno */
- /* also can get set in fnodes, which
- may be a chkdsk glitch or may mean
- this bit is irrelevant in fnodes,
- or this interpretation is all wet */
- u8 binary_search: 1; /* suggest binary search (unused) */
- u8 internal: 1; /* 1 -> (internal) tree of anodes
- 0 -> (leaf) list of extents */
-#else
- u8 internal: 1; /* 1 -> (internal) tree of anodes
- 0 -> (leaf) list of extents */
- u8 binary_search: 1; /* suggest binary search (unused) */
- u8 fnode_parent: 1; /* ? we're pointed to by an fnode,
+ u8 flags; /* bit 0 - high bit of first free entry offset
+ bit 5 - we're pointed to by an fnode,
the data btree or some ea or the
- main ea bootage pointer ea_secno */
- /* also can get set in fnodes, which
- may be a chkdsk glitch or may mean
- this bit is irrelevant in fnodes,
- or this interpretation is all wet */
- u8 flag1234: 4;
- u8 hbff: 1; /* high bit of first free entry offset */
-#endif
+ main ea bootage pointer ea_secno
+ bit 6 - suggest binary search (unused)
+ bit 7 - 1 -> (internal) tree of anodes
+ 0 -> (leaf) list of extents */
u8 fill[3];
u8 n_free_nodes; /* free nodes in following array */
u8 n_used_nodes; /* used nodes in following array */
- u16 first_free; /* offset from start of header to
+ __le16 first_free; /* offset from start of header to
first free node in array */
union {
struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
@@ -428,6 +414,16 @@ struct bplus_header
} u;
};
+static inline bool bp_internal(struct bplus_header *bp)
+{
+ return bp->flags & BP_internal;
+}
+
+static inline bool bp_fnode_parent(struct bplus_header *bp)
+{
+ return bp->flags & BP_fnode_parent;
+}
+
/* fnode: root of allocation b+ tree, and EA's */
/* Every file and every directory has one fnode, pointed to by the directory
@@ -436,62 +432,56 @@ struct bplus_header
#define FNODE_MAGIC 0xf7e40aae
+enum {FNODE_anode = cpu_to_le16(2), FNODE_dir = cpu_to_le16(256)};
struct fnode
{
- u32 magic; /* f7e4 0aae */
- u32 zero1[2]; /* read history */
+ __le32 magic; /* f7e4 0aae */
+ __le32 zero1[2]; /* read history */
u8 len, name[15]; /* true length, truncated name */
- fnode_secno up; /* pointer to file's directory fnode */
- secno acl_size_l;
- secno acl_secno;
- u16 acl_size_s;
+ __le32 up; /* pointer to file's directory fnode */
+ __le32 acl_size_l;
+ __le32 acl_secno;
+ __le16 acl_size_s;
u8 acl_anode;
u8 zero2; /* history bit count */
- u32 ea_size_l; /* length of disk-resident ea's */
- secno ea_secno; /* first sector of disk-resident ea's*/
- u16 ea_size_s; /* length of fnode-resident ea's */
-
-#ifdef __LITTLE_ENDIAN
- u8 flag0: 1;
- u8 ea_anode: 1; /* 1 -> ea_secno is an anode */
- u8 flag234567: 6;
-#else
- u8 flag234567: 6;
- u8 ea_anode: 1; /* 1 -> ea_secno is an anode */
- u8 flag0: 1;
-#endif
+ __le32 ea_size_l; /* length of disk-resident ea's */
+ __le32 ea_secno; /* first sector of disk-resident ea's*/
+ __le16 ea_size_s; /* length of fnode-resident ea's */
-#ifdef __LITTLE_ENDIAN
- u8 dirflag: 1; /* 1 -> directory. first & only extent
- points to dnode. */
- u8 flag9012345: 7;
-#else
- u8 flag9012345: 7;
- u8 dirflag: 1; /* 1 -> directory. first & only extent
+ __le16 flags; /* bit 1 set -> ea_secno is an anode */
+ /* bit 8 set -> directory. first & only extent
points to dnode. */
-#endif
-
struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */
union {
struct bplus_leaf_node external[8];
struct bplus_internal_node internal[12];
} u;
- u32 file_size; /* file length, bytes */
- u32 n_needea; /* number of EA's with NEEDEA set */
+ __le32 file_size; /* file length, bytes */
+ __le32 n_needea; /* number of EA's with NEEDEA set */
u8 user_id[16]; /* unused */
- u16 ea_offs; /* offset from start of fnode
+ __le16 ea_offs; /* offset from start of fnode
to first fnode-resident ea */
u8 dasd_limit_treshhold;
u8 dasd_limit_delta;
- u32 dasd_limit;
- u32 dasd_usage;
+ __le32 dasd_limit;
+ __le32 dasd_usage;
u8 ea[316]; /* zero or more EA's, packed together
with no alignment padding.
(Do not use this name, get here
via fnode + ea_offs. I think.) */
};
+static inline bool fnode_in_anode(struct fnode *p)
+{
+ return (p->flags & FNODE_anode) != 0;
+}
+
+static inline bool fnode_is_dir(struct fnode *p)
+{
+ return (p->flags & FNODE_dir) != 0;
+}
+
/* anode: 99.44% pure allocation tree */
@@ -499,9 +489,9 @@ struct fnode
struct anode
{
- u32 magic; /* 37e4 0aae */
- anode_secno self; /* pointer to this anode */
- secno up; /* parent anode or fnode */
+ __le32 magic; /* 37e4 0aae */
+ __le32 self; /* pointer to this anode */
+ __le32 up; /* parent anode or fnode */
struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */
union {
@@ -509,7 +499,7 @@ struct anode
struct bplus_internal_node internal[60];
} u;
- u32 fill[3]; /* unused */
+ __le32 fill[3]; /* unused */
};
@@ -528,32 +518,23 @@ struct anode
run, or in multiple runs. Flags in the fnode tell whether the EA list
is immediate, in a single run, or in multiple runs. */
+enum {EA_indirect = 1, EA_anode = 2, EA_needea = 128 };
struct extended_attribute
{
-#ifdef __LITTLE_ENDIAN
- u8 indirect: 1; /* 1 -> value gives sector number
+ u8 flags; /* bit 0 set -> value gives sector number
where real value starts */
- u8 anode: 1; /* 1 -> sector is an anode
+ /* bit 1 set -> sector is an anode
that points to fragmented value */
- u8 flag23456: 5;
- u8 needea: 1; /* required ea */
-#else
- u8 needea: 1; /* required ea */
- u8 flag23456: 5;
- u8 anode: 1; /* 1 -> sector is an anode
- that points to fragmented value */
- u8 indirect: 1; /* 1 -> value gives sector number
- where real value starts */
-#endif
+ /* bit 7 set -> required ea */
u8 namelen; /* length of name, bytes */
u8 valuelen_lo; /* length of value, bytes */
u8 valuelen_hi; /* length of value, bytes */
- u8 name[0];
+ u8 name[];
/*
u8 name[namelen]; ascii attrib name
u8 nul; terminating '\0', not counted
u8 value[valuelen]; value, arbitrary
- if this.indirect, valuelen is 8 and the value is
+ if this.flags & 1, valuelen is 8 and the value is
u32 length; real length of value, bytes
secno secno; sector address where it starts
if this.anode, the above sector number is the root of an anode tree
@@ -561,6 +542,16 @@ struct extended_attribute
*/
};
+static inline bool ea_indirect(struct extended_attribute *ea)
+{
+ return ea->flags & EA_indirect;
+}
+
+static inline bool ea_in_anode(struct extended_attribute *ea)
+{
+ return ea->flags & EA_anode;
+}
+
/*
Local Variables:
comment-column: 40
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 6d2d5008fa4..c07ef1f1ced 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -75,7 +75,7 @@ struct hpfs_sb_info {
unsigned char *sb_cp_table; /* code page tables: */
/* 128 bytes uppercasing table & */
/* 128 bytes lowercasing table */
- unsigned *sb_bmp_dir; /* main bitmap directory */
+ __le32 *sb_bmp_dir; /* main bitmap directory */
unsigned sb_c_bitmap; /* current bitmap */
unsigned sb_max_fwd_alloc; /* max forwad allocation */
int sb_timeshift;
@@ -93,7 +93,7 @@ struct quad_buffer_head {
static inline dnode_secno de_down_pointer (struct hpfs_dirent *de)
{
CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n"));
- return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4));
+ return le32_to_cpu(*(__le32 *) ((void *) de + le16_to_cpu(de->length) - 4));
}
/* The first dir entry in a dnode */
@@ -141,12 +141,12 @@ static inline struct extended_attribute *next_ea(struct extended_attribute *ea)
static inline secno ea_sec(struct extended_attribute *ea)
{
- return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen)));
+ return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 9 + ea->namelen)));
}
static inline secno ea_len(struct extended_attribute *ea)
{
- return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen)));
+ return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 5 + ea->namelen)));
}
static inline char *ea_data(struct extended_attribute *ea)
@@ -171,7 +171,7 @@ static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src)
dst->not_8x3 = n;
}
-static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n)
+static inline unsigned tstbits(__le32 *bmp, unsigned b, unsigned n)
{
int i;
if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n;
@@ -268,10 +268,10 @@ void hpfs_evict_inode(struct inode *);
/* map.c */
-unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
-unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
+__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
+__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
unsigned char *hpfs_load_code_page(struct super_block *, secno);
-secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
+__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index b43066cbdc6..ed671e0ea78 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -110,7 +110,7 @@ void hpfs_read_inode(struct inode *i)
}
}
}
- if (fnode->dirflag) {
+ if (fnode_is_dir(fnode)) {
int n_dnodes, n_subdirs;
i->i_mode |= S_IFDIR;
i->i_op = &hpfs_dir_iops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index a790821366a..4acb19d7835 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -8,12 +8,12 @@
#include "hpfs_fn.h"
-unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
+__le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
{
return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0);
}
-unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
+__le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
struct quad_buffer_head *qbh, char *id)
{
secno sec;
@@ -89,18 +89,18 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
return cp_table;
}
-secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
+__le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
{
struct buffer_head *bh;
int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21;
int i;
- secno *b;
+ __le32 *b;
if (!(b = kmalloc(n * 512, GFP_KERNEL))) {
printk("HPFS: can't allocate memory for bitmap directory\n");
return NULL;
}
for (i=0;i<n;i++) {
- secno *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
+ __le32 *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
if (!d) {
kfree(b);
return NULL;
@@ -130,16 +130,16 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea
(unsigned long)ino);
goto bail;
}
- if (!fnode->dirflag) {
+ if (!fnode_is_dir(fnode)) {
if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes !=
- (fnode->btree.internal ? 12 : 8)) {
+ (bp_internal(&fnode->btree) ? 12 : 8)) {
hpfs_error(s,
"bad number of nodes in fnode %08lx",
(unsigned long)ino);
goto bail;
}
if (le16_to_cpu(fnode->btree.first_free) !=
- 8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 8 : 12)) {
+ 8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) {
hpfs_error(s,
"bad first_free pointer in fnode %08lx",
(unsigned long)ino);
@@ -187,12 +187,12 @@ struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buff
goto bail;
}
if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes !=
- (anode->btree.internal ? 60 : 40)) {
+ (bp_internal(&anode->btree) ? 60 : 40)) {
hpfs_error(s, "bad number of nodes in anode %08x", ano);
goto bail;
}
if (le16_to_cpu(anode->btree.first_free) !=
- 8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) {
+ 8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) {
hpfs_error(s, "bad first_free pointer in anode %08x", ano);
goto bail;
}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 30dd7b10b50..9083ef8af58 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -70,7 +70,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fnode->len = len;
memcpy(fnode->name, name, len > 15 ? 15 : len);
fnode->up = cpu_to_le32(dir->i_ino);
- fnode->dirflag = 1;
+ fnode->flags |= FNODE_dir;
fnode->btree.n_free_nodes = 7;
fnode->btree.n_used_nodes = 1;
fnode->btree.first_free = cpu_to_le16(0x14);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 54f6eccb79d..706a12c083e 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -572,7 +572,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
mark_buffer_dirty(bh2);
}
- if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) {
+ if (spareblock->hotfixes_used || spareblock->n_spares_used) {
if (errs >= 2) {
printk("HPFS: Hotfixes not supported here, try chkdsk\n");
mark_dirty(s, 0);
@@ -645,7 +645,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
root->i_mtime.tv_nsec = 0;
root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date));
root->i_ctime.tv_nsec = 0;
- hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size);
+ hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size);
hpfs_i(root)->i_parent_dir = root->i_ino;
if (root->i_size == -1)
root->i_size = 2048;
diff --git a/fs/inode.c b/fs/inode.c
index c474c1d7062..c99163b1b31 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1487,10 +1487,30 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
return 0;
}
+/*
+ * This does the actual work of updating an inodes time or version. Must have
+ * had called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+ if (inode->i_op->update_time)
+ return inode->i_op->update_time(inode, time, flags);
+
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_VERSION)
+ inode_inc_iversion(inode);
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+ mark_inode_dirty_sync(inode);
+ return 0;
+}
+
/**
* touch_atime - update the access time
- * @mnt: mount the inode is accessed on
- * @dentry: dentry accessed
+ * @path: the &struct path to update
*
* Update the accessed time on an inode and mark it for writeback.
* This function automatically handles read only file systems and media,
@@ -1525,12 +1545,83 @@ void touch_atime(struct path *path)
if (mnt_want_write(mnt))
return;
- inode->i_atime = now;
- mark_inode_dirty_sync(inode);
+ /*
+ * File systems can error out when updating inodes if they need to
+ * allocate new space to modify an inode (such is the case for
+ * Btrfs), but since we touch atime while walking down the path we
+ * really don't care if we failed to update the atime of the file,
+ * so just ignore the return value.
+ */
+ update_time(inode, &now, S_ATIME);
mnt_drop_write(mnt);
}
EXPORT_SYMBOL(touch_atime);
+/*
+ * The logic we want is
+ *
+ * if suid or (sgid and xgrp)
+ * remove privs
+ */
+int should_remove_suid(struct dentry *dentry)
+{
+ umode_t mode = dentry->d_inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ /*
+ * sgid without any exec bits is just a mandatory locking mark; leave
+ * it alone. If some exec bits are set, it's a real sgid; kill it.
+ */
+ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+ kill |= ATTR_KILL_SGID;
+
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+EXPORT_SYMBOL(should_remove_suid);
+
+static int __remove_suid(struct dentry *dentry, int kill)
+{
+ struct iattr newattrs;
+
+ newattrs.ia_valid = ATTR_FORCE | kill;
+ return notify_change(dentry, &newattrs);
+}
+
+int file_remove_suid(struct file *file)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ int killsuid;
+ int killpriv;
+ int error = 0;
+
+ /* Fast path for nothing security related */
+ if (IS_NOSEC(inode))
+ return 0;
+
+ killsuid = should_remove_suid(dentry);
+ killpriv = security_inode_need_killpriv(dentry);
+
+ if (killpriv < 0)
+ return killpriv;
+ if (killpriv)
+ error = security_inode_killpriv(dentry);
+ if (!error && killsuid)
+ error = __remove_suid(dentry, killsuid);
+ if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+ inode->i_flags |= S_NOSEC;
+
+ return error;
+}
+EXPORT_SYMBOL(file_remove_suid);
+
/**
* file_update_time - update mtime and ctime time
* @file: file accessed
@@ -1540,18 +1631,20 @@ EXPORT_SYMBOL(touch_atime);
* usage in the file write path of filesystems, and filesystems may
* choose to explicitly ignore update via this function with the
* S_NOCMTIME inode flag, e.g. for network filesystem where these
- * timestamps are handled by the server.
+ * timestamps are handled by the server. This can return an error for
+ * file systems who need to allocate space in order to update an inode.
*/
-void file_update_time(struct file *file)
+int file_update_time(struct file *file)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct timespec now;
- enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+ int sync_it = 0;
+ int ret;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
- return;
+ return 0;
now = current_fs_time(inode->i_sb);
if (!timespec_equal(&inode->i_mtime, &now))
@@ -1564,21 +1657,16 @@ void file_update_time(struct file *file)
sync_it |= S_VERSION;
if (!sync_it)
- return;
+ return 0;
/* Finally allowed to write? Takes lock. */
if (mnt_want_write_file(file))
- return;
+ return 0;
- /* Only change inode inside the lock region */
- if (sync_it & S_VERSION)
- inode_inc_iversion(inode);
- if (sync_it & S_CTIME)
- inode->i_ctime = now;
- if (sync_it & S_MTIME)
- inode->i_mtime = now;
- mark_inode_dirty_sync(inode);
+ ret = update_time(inode, &now, sync_it);
mnt_drop_write_file(file);
+
+ return ret;
}
EXPORT_SYMBOL(file_update_time);
diff --git a/fs/internal.h b/fs/internal.h
index 9962c59ba28..18bc216ea09 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
-DECLARE_BRLOCK(vfsmount_lock);
+extern struct lglock vfsmount_lock;
/*
@@ -100,6 +100,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
extern long do_handle_open(int mountdirfd,
struct file_handle __user *ufh, int open_flag);
+extern int open_check_o_direct(struct file *f);
/*
* inode.c
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index dd4687ff30d..aa4356d09ee 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -107,12 +107,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
}
static int
-isofs_export_encode_fh(struct dentry *dentry,
+isofs_export_encode_fh(struct inode *inode,
__u32 *fh32,
int *max_len,
- int connectable)
+ struct inode *parent)
{
- struct inode * inode = dentry->d_inode;
struct iso_inode_info * ei = ISOFS_I(inode);
int len = *max_len;
int type = 1;
@@ -124,7 +123,7 @@ isofs_export_encode_fh(struct dentry *dentry,
* offset of the inode and the upper 16 bits of fh32[1] to
* hold the offset of the parent.
*/
- if (connectable && (len < 5)) {
+ if (parent && (len < 5)) {
*max_len = 5;
return 255;
} else if (len < 3) {
@@ -136,16 +135,12 @@ isofs_export_encode_fh(struct dentry *dentry,
fh32[0] = ei->i_iget5_block;
fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */
fh32[2] = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
+ if (parent) {
struct iso_inode_info *eparent;
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent->d_inode;
eparent = ISOFS_I(parent);
fh32[3] = eparent->i_iget5_block;
fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */
fh32[4] = parent->i_generation;
- spin_unlock(&dentry->d_lock);
len = 5;
type = 2;
}
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index f32f346f4b0..69a48c2944d 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,6 +1,8 @@
config JBD2
tristate
select CRC32
+ select CRYPTO
+ select CRYPTO_CRC32C
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 840f70f5079..216f4299f65 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -85,6 +85,24 @@ nope:
__brelse(bh);
}
+static void jbd2_commit_block_csum_set(journal_t *j,
+ struct journal_head *descriptor)
+{
+ struct commit_header *h;
+ __u32 csum;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+ h->h_chksum_type = 0;
+ h->h_chksum_size = 0;
+ h->h_chksum[0] = 0;
+ csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+ j->j_blocksize);
+ h->h_chksum[0] = cpu_to_be32(csum);
+}
+
/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
@@ -128,6 +146,7 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
+ jbd2_commit_block_csum_set(journal, descriptor);
JBUFFER_TRACE(descriptor, "submit commit block");
lock_buffer(bh);
@@ -301,6 +320,44 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
+static void jbd2_descr_block_csum_set(journal_t *j,
+ struct journal_head *descriptor)
+{
+ struct jbd2_journal_block_tail *tail;
+ __u32 csum;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ tail = (struct jbd2_journal_block_tail *)
+ (jh2bh(descriptor)->b_data + j->j_blocksize -
+ sizeof(struct jbd2_journal_block_tail));
+ tail->t_checksum = 0;
+ csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+ j->j_blocksize);
+ tail->t_checksum = cpu_to_be32(csum);
+}
+
+static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
+ struct buffer_head *bh, __u32 sequence)
+{
+ struct page *page = bh->b_page;
+ __u8 *addr;
+ __u32 csum;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ sequence = cpu_to_be32(sequence);
+ addr = kmap_atomic(page, KM_USER0);
+ csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
+ bh->b_size);
+ kunmap_atomic(addr, KM_USER0);
+
+ tag->t_checksum = cpu_to_be32(csum);
+}
/*
* jbd2_journal_commit_transaction
*
@@ -334,6 +391,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
unsigned long first_block;
tid_t first_tid;
int update_tail;
+ int csum_size = 0;
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ csum_size = sizeof(struct jbd2_journal_block_tail);
/*
* First job: lock down the current transaction and wait for
@@ -627,7 +688,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag = (journal_block_tag_t *) tagp;
write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
- tag->t_flags = cpu_to_be32(tag_flag);
+ tag->t_flags = cpu_to_be16(tag_flag);
+ jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+ commit_transaction->t_tid);
tagp += tag_bytes;
space_left -= tag_bytes;
@@ -643,7 +706,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (bufs == journal->j_wbufsize ||
commit_transaction->t_buffers == NULL ||
- space_left < tag_bytes + 16) {
+ space_left < tag_bytes + 16 + csum_size) {
jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
@@ -651,8 +714,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
submitting the IOs. "tag" still points to
the last tag we set up. */
- tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
+ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
+ jbd2_descr_block_csum_set(journal, descriptor);
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1afb701622b..e9a3c4c8559 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -97,6 +97,43 @@ EXPORT_SYMBOL(jbd2_inode_cache);
static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
+/* Checksumming functions */
+int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
+{
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
+}
+
+static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+{
+ __u32 csum, old_csum;
+
+ old_csum = sb->s_checksum;
+ sb->s_checksum = 0;
+ csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+ sb->s_checksum = old_csum;
+
+ return cpu_to_be32(csum);
+}
+
+int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
+{
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ return sb->s_checksum == jbd2_superblock_csum(j, sb);
+}
+
+void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
+{
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ sb->s_checksum = jbd2_superblock_csum(j, sb);
+}
+
/*
* Helper function used to manage commit timeouts
*/
@@ -1348,6 +1385,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
journal->j_errno);
sb->s_errno = cpu_to_be32(journal->j_errno);
+ jbd2_superblock_csum_set(journal, sb);
read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, WRITE_SYNC);
@@ -1376,6 +1414,9 @@ static int journal_get_superblock(journal_t *journal)
}
}
+ if (buffer_verified(bh))
+ return 0;
+
sb = journal->j_superblock;
err = -EINVAL;
@@ -1413,6 +1454,43 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
+ if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
+ JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ /* Can't have checksum v1 and v2 on at the same time! */
+ printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 "
+ "at the same time!\n");
+ goto out;
+ }
+
+ if (!jbd2_verify_csum_type(journal, sb)) {
+ printk(KERN_ERR "JBD: Unknown checksum type\n");
+ goto out;
+ }
+
+ /* Load the checksum driver */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD: Cannot load crc32c driver.\n");
+ err = PTR_ERR(journal->j_chksum_driver);
+ journal->j_chksum_driver = NULL;
+ goto out;
+ }
+ }
+
+ /* Check superblock checksum */
+ if (!jbd2_superblock_csum_verify(journal, sb)) {
+ printk(KERN_ERR "JBD: journal checksum error\n");
+ goto out;
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+
+ set_buffer_verified(bh);
+
return 0;
out:
@@ -1564,6 +1642,8 @@ int jbd2_journal_destroy(journal_t *journal)
iput(journal->j_inode);
if (journal->j_revoke)
jbd2_journal_destroy_revoke(journal);
+ if (journal->j_chksum_driver)
+ crypto_free_shash(journal->j_chksum_driver);
kfree(journal->j_wbuf);
kfree(journal);
@@ -1653,6 +1733,10 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
+#define INCOMPAT_FEATURE_ON(f) \
+ ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
+#define COMPAT_FEATURE_ON(f) \
+ ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
journal_superblock_t *sb;
if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
@@ -1661,16 +1745,54 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
return 0;
+ /* Asking for checksumming v2 and v1? Only give them v2. */
+ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
+ compat & JBD2_FEATURE_COMPAT_CHECKSUM)
+ compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
+
jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
+ /* If enabling v2 checksums, update superblock */
+ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
+ sb->s_feature_compat &=
+ ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
+
+ /* Load the checksum driver */
+ if (journal->j_chksum_driver == NULL) {
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c",
+ 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD: Cannot load crc32c "
+ "driver.\n");
+ journal->j_chksum_driver = NULL;
+ return 0;
+ }
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ journal->j_csum_seed = jbd2_chksum(journal, ~0,
+ sb->s_uuid,
+ sizeof(sb->s_uuid));
+ }
+
+ /* If enabling v1 checksums, downgrade superblock */
+ if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
+ sb->s_feature_incompat &=
+ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
+
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
return 1;
+#undef COMPAT_FEATURE_ON
+#undef INCOMPAT_FEATURE_ON
}
/*
@@ -1975,10 +2097,16 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
*/
size_t journal_tag_bytes(journal_t *journal)
{
+ journal_block_tag_t tag;
+ size_t x = 0;
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ x += sizeof(tag.t_checksum);
+
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
- return JBD2_TAG_SIZE64;
+ return x + JBD2_TAG_SIZE64;
else
- return JBD2_TAG_SIZE32;
+ return x + JBD2_TAG_SIZE32;
}
/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index c1a03354a22..0131e436253 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,6 +174,25 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return 0;
}
+static int jbd2_descr_block_csum_verify(journal_t *j,
+ void *buf)
+{
+ struct jbd2_journal_block_tail *tail;
+ __u32 provided, calculated;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
+ sizeof(struct jbd2_journal_block_tail));
+ provided = tail->t_checksum;
+ tail->t_checksum = 0;
+ calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ tail->t_checksum = provided;
+
+ provided = be32_to_cpu(provided);
+ return provided == calculated;
+}
/*
* Count the number of in-use tags in a journal descriptor block.
@@ -186,6 +205,9 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ size -= sizeof(struct jbd2_journal_block_tail);
+
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes) <= size) {
@@ -193,10 +215,10 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
nr++;
tagp += tag_bytes;
- if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
+ if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
tagp += 16;
- if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
+ if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
break;
}
@@ -353,6 +375,41 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
return 0;
}
+static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
+{
+ struct commit_header *h;
+ __u32 provided, calculated;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ h = buf;
+ provided = h->h_chksum[0];
+ h->h_chksum[0] = 0;
+ calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ h->h_chksum[0] = provided;
+
+ provided = be32_to_cpu(provided);
+ return provided == calculated;
+}
+
+static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
+ void *buf, __u32 sequence)
+{
+ __u32 provided, calculated;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ sequence = cpu_to_be32(sequence);
+ calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
+ provided = be32_to_cpu(tag->t_checksum);
+
+ return provided == cpu_to_be32(calculated);
+}
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
@@ -366,6 +423,7 @@ static int do_one_pass(journal_t *journal,
int blocktype;
int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
+ int descr_csum_size = 0;
/*
* First thing is to establish what we expect to find in the log
@@ -451,6 +509,18 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
+ /* Verify checksum first */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ descr_csum_size =
+ sizeof(struct jbd2_journal_block_tail);
+ if (descr_csum_size > 0 &&
+ !jbd2_descr_block_csum_verify(journal,
+ bh->b_data)) {
+ err = -EIO;
+ goto failed;
+ }
+
/* If it is a valid descriptor block, replay it
* in pass REPLAY; if journal_checksums enabled, then
* calculate checksums in PASS_SCAN, otherwise,
@@ -481,11 +551,11 @@ static int do_one_pass(journal_t *journal,
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes)
- <= journal->j_blocksize) {
+ <= journal->j_blocksize - descr_csum_size) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
- flags = be32_to_cpu(tag->t_flags);
+ flags = be16_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
@@ -516,6 +586,19 @@ static int do_one_pass(journal_t *journal,
goto skip_write;
}
+ /* Look for block corruption */
+ if (!jbd2_block_tag_csum_verify(
+ journal, tag, obh->b_data,
+ be32_to_cpu(tmp->h_sequence))) {
+ brelse(obh);
+ success = -EIO;
+ printk(KERN_ERR "JBD: Invalid "
+ "checksum recovering "
+ "block %llu in log\n",
+ blocknr);
+ continue;
+ }
+
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
@@ -650,6 +733,19 @@ static int do_one_pass(journal_t *journal,
}
crc32_sum = ~0;
}
+ if (pass == PASS_SCAN &&
+ !jbd2_commit_block_csum_verify(journal,
+ bh->b_data)) {
+ info->end_transaction = next_commit_ID;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ journal->j_failed_commit =
+ next_commit_ID;
+ brelse(bh);
+ break;
+ }
+ }
brelse(bh);
next_commit_ID++;
continue;
@@ -706,6 +802,25 @@ static int do_one_pass(journal_t *journal,
return err;
}
+static int jbd2_revoke_block_csum_verify(journal_t *j,
+ void *buf)
+{
+ struct jbd2_journal_revoke_tail *tail;
+ __u32 provided, calculated;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
+ sizeof(struct jbd2_journal_revoke_tail));
+ provided = tail->r_checksum;
+ tail->r_checksum = 0;
+ calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ tail->r_checksum = provided;
+
+ provided = be32_to_cpu(provided);
+ return provided == calculated;
+}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
@@ -720,6 +835,9 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
offset = sizeof(jbd2_journal_revoke_header_t);
max = be32_to_cpu(header->r_count);
+ if (!jbd2_revoke_block_csum_verify(journal, header))
+ return -EINVAL;
+
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
record_len = 8;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 6973705d6a3..f30b80b4ce8 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -578,6 +578,7 @@ static void write_one_revoke_record(journal_t *journal,
struct jbd2_revoke_record_s *record,
int write_op)
{
+ int csum_size = 0;
struct journal_head *descriptor;
int offset;
journal_header_t *header;
@@ -592,9 +593,13 @@ static void write_one_revoke_record(journal_t *journal,
descriptor = *descriptorp;
offset = *offsetp;
+ /* Do we need to leave space at the end for a checksum? */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ csum_size = sizeof(struct jbd2_journal_revoke_tail);
+
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
- if (offset == journal->j_blocksize) {
+ if (offset >= journal->j_blocksize - csum_size) {
flush_descriptor(journal, descriptor, offset, write_op);
descriptor = NULL;
}
@@ -631,6 +636,24 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
+static void jbd2_revoke_csum_set(journal_t *j,
+ struct journal_head *descriptor)
+{
+ struct jbd2_journal_revoke_tail *tail;
+ __u32 csum;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ tail = (struct jbd2_journal_revoke_tail *)
+ (jh2bh(descriptor)->b_data + j->j_blocksize -
+ sizeof(struct jbd2_journal_revoke_tail));
+ tail->r_checksum = 0;
+ csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+ j->j_blocksize);
+ tail->r_checksum = cpu_to_be32(csum);
+}
+
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
@@ -652,6 +675,8 @@ static void flush_descriptor(journal_t *journal,
header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
header->r_count = cpu_to_be32(offset);
+ jbd2_revoke_csum_set(journal, descriptor);
+
set_buffer_jwrite(bh);
BUFFER_TRACE(bh, "write");
set_buffer_dirty(bh);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ddcd3549c6c..fb1ab9533b6 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -162,8 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
alloc_transaction:
if (!journal->j_running_transaction) {
- new_transaction = kmem_cache_alloc(transaction_cache,
- gfp_mask | __GFP_ZERO);
+ new_transaction = kmem_cache_zalloc(transaction_cache,
+ gfp_mask);
if (!new_transaction) {
/*
* If __GFP_FS is not present, then we may be
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 55a0c1dcead..44dca1f041c 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -126,6 +126,10 @@ struct jffs2_sb_info {
struct jffs2_inodirty *wbuf_inodes;
struct rw_semaphore wbuf_sem; /* Protects the write buffer */
+ struct delayed_work wbuf_dwork; /* write-buffer write-out work */
+ int wbuf_queued; /* non-zero delayed work is queued */
+ spinlock_t wbuf_dwork_lock; /* protects wbuf_dwork and and wbuf_queued */
+
unsigned char *oobbuf;
int oobavail; /* How many bytes are available for JFFS2 in OOB */
#endif
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 1cd3aec9d9a..bcd983d7e7f 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -95,6 +95,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
#define jffs2_ubivol(c) (0)
#define jffs2_ubivol_setup(c) (0)
#define jffs2_ubivol_cleanup(c) do {} while (0)
+#define jffs2_dirty_trigger(c) do {} while (0)
#else /* NAND and/or ECC'd NOR support present */
@@ -135,14 +136,10 @@ void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
+void jffs2_dirty_trigger(struct jffs2_sb_info *c);
#endif /* WRITEBUFFER */
-static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
-{
- OFNI_BS_2SFFJ(c)->s_dirt = 1;
-}
-
/* background.c */
int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c);
void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f9916f312bd..bc586f20422 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,21 +63,6 @@ static void jffs2_i_init_once(void *foo)
inode_init_once(&f->vfs_inode);
}
-static void jffs2_write_super(struct super_block *sb)
-{
- struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-
- lock_super(sb);
- sb->s_dirt = 0;
-
- if (!(sb->s_flags & MS_RDONLY)) {
- jffs2_dbg(1, "%s()\n", __func__);
- jffs2_flush_wbuf_gc(c, 0);
- }
-
- unlock_super(sb);
-}
-
static const char *jffs2_compr_name(unsigned int compr)
{
switch (compr) {
@@ -113,8 +98,6 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
- jffs2_write_super(sb);
-
mutex_lock(&c->alloc_sem);
jffs2_flush_wbuf_pad(c);
mutex_unlock(&c->alloc_sem);
@@ -251,7 +234,6 @@ static const struct super_operations jffs2_super_operations =
.alloc_inode = jffs2_alloc_inode,
.destroy_inode =jffs2_destroy_inode,
.put_super = jffs2_put_super,
- .write_super = jffs2_write_super,
.statfs = jffs2_statfs,
.remount_fs = jffs2_remount_fs,
.evict_inode = jffs2_evict_inode,
@@ -319,9 +301,6 @@ static void jffs2_put_super (struct super_block *sb)
jffs2_dbg(2, "%s()\n", __func__);
- if (sb->s_dirt)
- jffs2_write_super(sb);
-
mutex_lock(&c->alloc_sem);
jffs2_flush_wbuf_pad(c);
mutex_unlock(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 74d9be19df3..6f4529d3697 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -20,6 +20,7 @@
#include <linux/mtd/nand.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
+#include <linux/writeback.h>
#include "nodelist.h"
@@ -85,7 +86,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
{
struct jffs2_inodirty *new;
- /* Mark the superblock dirty so that kupdated will flush... */
+ /* Schedule delayed write-buffer write-out */
jffs2_dirty_trigger(c);
if (jffs2_wbuf_pending_for_ino(c, ino))
@@ -1148,6 +1149,47 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
return 1;
}
+static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
+{
+ struct delayed_work *dwork;
+
+ dwork = container_of(work, struct delayed_work, work);
+ return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
+}
+
+static void delayed_wbuf_sync(struct work_struct *work)
+{
+ struct jffs2_sb_info *c = work_to_sb(work);
+ struct super_block *sb = OFNI_BS_2SFFJ(c);
+
+ spin_lock(&c->wbuf_dwork_lock);
+ c->wbuf_queued = 0;
+ spin_unlock(&c->wbuf_dwork_lock);
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ jffs2_dbg(1, "%s()\n", __func__);
+ jffs2_flush_wbuf_gc(c, 0);
+ }
+}
+
+void jffs2_dirty_trigger(struct jffs2_sb_info *c)
+{
+ struct super_block *sb = OFNI_BS_2SFFJ(c);
+ unsigned long delay;
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ spin_lock(&c->wbuf_dwork_lock);
+ if (!c->wbuf_queued) {
+ jffs2_dbg(1, "%s()\n", __func__);
+ delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+ queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay);
+ c->wbuf_queued = 1;
+ }
+ spin_unlock(&c->wbuf_dwork_lock);
+}
+
int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
{
struct nand_ecclayout *oinfo = c->mtd->ecclayout;
@@ -1169,6 +1211,8 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
/* Initialise write buffer */
init_rwsem(&c->wbuf_sem);
+ spin_lock_init(&c->wbuf_dwork_lock);
+ INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
c->wbuf_ofs = 0xFFFFFFFF;
@@ -1207,8 +1251,8 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
/* Initialize write buffer */
init_rwsem(&c->wbuf_sem);
-
-
+ spin_lock_init(&c->wbuf_dwork_lock);
+ INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->erasesize;
/* Find a suitable c->sector_size
@@ -1267,6 +1311,9 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
/* Initialize write buffer */
init_rwsem(&c->wbuf_sem);
+ spin_lock_init(&c->wbuf_dwork_lock);
+ INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
+
c->wbuf_pagesize = c->mtd->writesize;
c->wbuf_ofs = 0xFFFFFFFF;
@@ -1299,6 +1346,8 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
return 0;
init_rwsem(&c->wbuf_sem);
+ spin_lock_init(&c->wbuf_dwork_lock);
+ INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
c->wbuf_ofs = 0xFFFFFFFF;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1ead0750cdb..80938fda67e 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -251,39 +251,40 @@ out_err:
return err;
}
-static int lockd_up_net(struct net *net)
+static int lockd_up_net(struct svc_serv *serv, struct net *net)
{
struct lockd_net *ln = net_generic(net, lockd_net_id);
- struct svc_serv *serv = nlmsvc_rqst->rq_server;
int error;
- if (ln->nlmsvc_users)
+ if (ln->nlmsvc_users++)
return 0;
- error = svc_rpcb_setup(serv, net);
+ error = svc_bind(serv, net);
if (error)
- goto err_rpcb;
+ goto err_bind;
error = make_socks(serv, net);
if (error < 0)
goto err_socks;
+ dprintk("lockd_up_net: per-net data created; net=%p\n", net);
return 0;
err_socks:
svc_rpcb_cleanup(serv, net);
-err_rpcb:
+err_bind:
+ ln->nlmsvc_users--;
return error;
}
-static void lockd_down_net(struct net *net)
+static void lockd_down_net(struct svc_serv *serv, struct net *net)
{
struct lockd_net *ln = net_generic(net, lockd_net_id);
- struct svc_serv *serv = nlmsvc_rqst->rq_server;
if (ln->nlmsvc_users) {
if (--ln->nlmsvc_users == 0) {
nlm_shutdown_hosts_net(net);
svc_shutdown_net(serv, net);
+ dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
}
} else {
printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
@@ -292,21 +293,60 @@ static void lockd_down_net(struct net *net)
}
}
-/*
- * Bring up the lockd process if it's not already up.
- */
-int lockd_up(struct net *net)
+static int lockd_start_svc(struct svc_serv *serv)
+{
+ int error;
+
+ if (nlmsvc_rqst)
+ return 0;
+
+ /*
+ * Create the kernel thread and wait for it to start.
+ */
+ nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+ if (IS_ERR(nlmsvc_rqst)) {
+ error = PTR_ERR(nlmsvc_rqst);
+ printk(KERN_WARNING
+ "lockd_up: svc_rqst allocation failed, error=%d\n",
+ error);
+ goto out_rqst;
+ }
+
+ svc_sock_update_bufs(serv);
+ serv->sv_maxconn = nlm_max_connections;
+
+ nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+ if (IS_ERR(nlmsvc_task)) {
+ error = PTR_ERR(nlmsvc_task);
+ printk(KERN_WARNING
+ "lockd_up: kthread_run failed, error=%d\n", error);
+ goto out_task;
+ }
+ dprintk("lockd_up: service started\n");
+ return 0;
+
+out_task:
+ svc_exit_thread(nlmsvc_rqst);
+ nlmsvc_task = NULL;
+out_rqst:
+ nlmsvc_rqst = NULL;
+ return error;
+}
+
+static struct svc_serv *lockd_create_svc(void)
{
struct svc_serv *serv;
- int error = 0;
- mutex_lock(&nlmsvc_mutex);
/*
* Check whether we're already up and running.
*/
if (nlmsvc_rqst) {
- error = lockd_up_net(net);
- goto out;
+ /*
+ * Note: increase service usage, because later in case of error
+ * svc_destroy() will be called.
+ */
+ svc_get(nlmsvc_rqst->rq_server);
+ return nlmsvc_rqst->rq_server;
}
/*
@@ -317,59 +357,53 @@ int lockd_up(struct net *net)
printk(KERN_WARNING
"lockd_up: no pid, %d users??\n", nlmsvc_users);
- error = -ENOMEM;
serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
- goto out;
+ return ERR_PTR(-ENOMEM);
}
+ dprintk("lockd_up: service created\n");
+ return serv;
+}
- error = make_socks(serv, net);
- if (error < 0)
- goto destroy_and_out;
+/*
+ * Bring up the lockd process if it's not already up.
+ */
+int lockd_up(struct net *net)
+{
+ struct svc_serv *serv;
+ int error;
- /*
- * Create the kernel thread and wait for it to start.
- */
- nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
- if (IS_ERR(nlmsvc_rqst)) {
- error = PTR_ERR(nlmsvc_rqst);
- nlmsvc_rqst = NULL;
- printk(KERN_WARNING
- "lockd_up: svc_rqst allocation failed, error=%d\n",
- error);
- goto destroy_and_out;
+ mutex_lock(&nlmsvc_mutex);
+
+ serv = lockd_create_svc();
+ if (IS_ERR(serv)) {
+ error = PTR_ERR(serv);
+ goto err_create;
}
- svc_sock_update_bufs(serv);
- serv->sv_maxconn = nlm_max_connections;
+ error = lockd_up_net(serv, net);
+ if (error < 0)
+ goto err_net;
- nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
- if (IS_ERR(nlmsvc_task)) {
- error = PTR_ERR(nlmsvc_task);
- svc_exit_thread(nlmsvc_rqst);
- nlmsvc_task = NULL;
- nlmsvc_rqst = NULL;
- printk(KERN_WARNING
- "lockd_up: kthread_run failed, error=%d\n", error);
- goto destroy_and_out;
- }
+ error = lockd_start_svc(serv);
+ if (error < 0)
+ goto err_start;
+ nlmsvc_users++;
/*
* Note: svc_serv structures have an initial use count of 1,
* so we exit through here on both success and failure.
*/
-destroy_and_out:
+err_net:
svc_destroy(serv);
-out:
- if (!error) {
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- ln->nlmsvc_users++;
- nlmsvc_users++;
- }
+err_create:
mutex_unlock(&nlmsvc_mutex);
return error;
+
+err_start:
+ lockd_down_net(serv, net);
+ goto err_net;
}
EXPORT_SYMBOL_GPL(lockd_up);
@@ -380,11 +414,10 @@ void
lockd_down(struct net *net)
{
mutex_lock(&nlmsvc_mutex);
+ lockd_down_net(nlmsvc_rqst->rq_server, net);
if (nlmsvc_users) {
- if (--nlmsvc_users) {
- lockd_down_net(net);
+ if (--nlmsvc_users)
goto out;
- }
} else {
printk(KERN_ERR "lockd_down: no users! task=%p\n",
nlmsvc_task);
@@ -396,7 +429,9 @@ lockd_down(struct net *net)
BUG();
}
kthread_stop(nlmsvc_task);
+ dprintk("lockd_down: service stopped\n");
svc_exit_thread(nlmsvc_rqst);
+ dprintk("lockd_down: service destroyed\n");
nlmsvc_task = NULL;
nlmsvc_rqst = NULL;
out:
diff --git a/fs/locks.c b/fs/locks.c
index 4f441e46cef..814c51d0de4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1636,12 +1636,13 @@ EXPORT_SYMBOL(flock_lock_file_wait);
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
struct file *filp;
+ int fput_needed;
struct file_lock *lock;
int can_sleep, unlock;
int error;
error = -EBADF;
- filp = fget(fd);
+ filp = fget_light(fd, &fput_needed);
if (!filp)
goto out;
@@ -1674,7 +1675,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
locks_free_lock(lock);
out_putf:
- fput(filp);
+ fput_light(filp, fput_needed);
out:
return error;
}
diff --git a/fs/namei.c b/fs/namei.c
index c651f02c9fe..7d694194024 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -449,7 +449,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
mntget(nd->path.mnt);
rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
nd->flags &= ~LOOKUP_RCU;
return 0;
@@ -507,14 +507,14 @@ static int complete_walk(struct nameidata *nd)
if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
spin_unlock(&dentry->d_lock);
rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return -ECHILD;
}
BUG_ON(nd->inode != dentry->d_inode);
spin_unlock(&dentry->d_lock);
mntget(nd->path.mnt);
rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
}
if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -681,15 +681,15 @@ int follow_up(struct path *path)
struct mount *parent;
struct dentry *mountpoint;
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
parent = mnt->mnt_parent;
if (&parent->mnt == path->mnt) {
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return 0;
}
mntget(&parent->mnt);
mountpoint = dget(mnt->mnt_mountpoint);
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
dput(path->dentry);
path->dentry = mountpoint;
mntput(path->mnt);
@@ -947,7 +947,7 @@ failed:
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return -ECHILD;
}
@@ -1125,8 +1125,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
* small and for now I'd prefer to have fast path as straight as possible.
* It _is_ time-critical.
*/
-static int do_lookup(struct nameidata *nd, struct qstr *name,
- struct path *path, struct inode **inode)
+static int lookup_fast(struct nameidata *nd, struct qstr *name,
+ struct path *path, struct inode **inode)
{
struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
@@ -1208,7 +1208,7 @@ unlazy:
goto need_lookup;
}
}
-done:
+
path->mnt = mnt;
path->dentry = dentry;
err = follow_managed(path, nd->flags);
@@ -1222,6 +1222,17 @@ done:
return 0;
need_lookup:
+ return 1;
+}
+
+/* Fast lookup failed, do it the slow way */
+static int lookup_slow(struct nameidata *nd, struct qstr *name,
+ struct path *path)
+{
+ struct dentry *dentry, *parent;
+ int err;
+
+ parent = nd->path.dentry;
BUG_ON(nd->inode != parent->d_inode);
mutex_lock(&parent->d_inode->i_mutex);
@@ -1229,7 +1240,16 @@ need_lookup:
mutex_unlock(&parent->d_inode->i_mutex);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- goto done;
+ path->mnt = nd->path.mnt;
+ path->dentry = dentry;
+ err = follow_managed(path, nd->flags);
+ if (unlikely(err < 0)) {
+ path_put_conditional(path, nd);
+ return err;
+ }
+ if (err)
+ nd->flags |= LOOKUP_JUMPED;
+ return 0;
}
static inline int may_lookup(struct nameidata *nd)
@@ -1265,7 +1285,7 @@ static void terminate_walk(struct nameidata *nd)
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
}
}
@@ -1301,21 +1321,26 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
*/
if (unlikely(type != LAST_NORM))
return handle_dots(nd, type);
- err = do_lookup(nd, name, path, &inode);
+ err = lookup_fast(nd, name, path, &inode);
if (unlikely(err)) {
- terminate_walk(nd);
- return err;
- }
- if (!inode) {
- path_to_nameidata(path, nd);
- terminate_walk(nd);
- return -ENOENT;
+ if (err < 0)
+ goto out_err;
+
+ err = lookup_slow(nd, name, path);
+ if (err < 0)
+ goto out_err;
+
+ inode = path->dentry->d_inode;
}
+ err = -ENOENT;
+ if (!inode)
+ goto out_path_put;
+
if (should_follow_link(inode, follow)) {
if (nd->flags & LOOKUP_RCU) {
if (unlikely(unlazy_walk(nd, path->dentry))) {
- terminate_walk(nd);
- return -ECHILD;
+ err = -ECHILD;
+ goto out_err;
}
}
BUG_ON(inode != path->dentry->d_inode);
@@ -1324,6 +1349,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
path_to_nameidata(path, nd);
nd->inode = inode;
return 0;
+
+out_path_put:
+ path_to_nameidata(path, nd);
+out_err:
+ terminate_walk(nd);
+ return err;
}
/*
@@ -1620,7 +1651,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
rcu_read_lock();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
@@ -1633,7 +1664,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (*name=='/') {
if (flags & LOOKUP_RCU) {
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
rcu_read_lock();
set_root_rcu(nd);
} else {
@@ -1646,7 +1677,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
struct fs_struct *fs = current->fs;
unsigned seq;
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
rcu_read_lock();
do {
@@ -1682,7 +1713,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (fput_needed)
*fp = file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
rcu_read_lock();
} else {
path_get(&file->f_path);
@@ -2169,6 +2200,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
int want_write = 0;
int acc_mode = op->acc_mode;
struct file *filp;
+ struct inode *inode;
+ int symlink_ok = 0;
+ struct path save_parent = { .dentry = NULL, .mnt = NULL };
+ bool retried = false;
int error;
nd->flags &= ~LOOKUP_PARENT;
@@ -2200,30 +2235,23 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
}
if (!(open_flag & O_CREAT)) {
- int symlink_ok = 0;
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
symlink_ok = 1;
/* we _can_ be in RCU mode here */
- error = walk_component(nd, path, &nd->last, LAST_NORM,
- !symlink_ok);
- if (error < 0)
- return ERR_PTR(error);
- if (error) /* symlink */
- return NULL;
- /* sayonara */
- error = complete_walk(nd);
- if (error)
- return ERR_PTR(error);
+ error = lookup_fast(nd, &nd->last, path, &inode);
+ if (unlikely(error)) {
+ if (error < 0)
+ goto exit;
- error = -ENOTDIR;
- if (nd->flags & LOOKUP_DIRECTORY) {
- if (!nd->inode->i_op->lookup)
+ error = lookup_slow(nd, &nd->last, path);
+ if (error < 0)
goto exit;
+
+ inode = path->dentry->d_inode;
}
- audit_inode(pathname, nd->path.dentry);
- goto ok;
+ goto finish_lookup;
}
/* create side of things */
@@ -2241,6 +2269,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (nd->last.name[nd->last.len])
goto exit;
+retry_lookup:
mutex_lock(&dir->d_inode->i_mutex);
dentry = lookup_hash(nd);
@@ -2302,22 +2331,49 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (error)
nd->flags |= LOOKUP_JUMPED;
+ BUG_ON(nd->flags & LOOKUP_RCU);
+ inode = path->dentry->d_inode;
+finish_lookup:
+ /* we _can_ be in RCU mode here */
error = -ENOENT;
- if (!path->dentry->d_inode)
- goto exit_dput;
+ if (!inode) {
+ path_to_nameidata(path, nd);
+ goto exit;
+ }
- if (path->dentry->d_inode->i_op->follow_link)
+ if (should_follow_link(inode, !symlink_ok)) {
+ if (nd->flags & LOOKUP_RCU) {
+ if (unlikely(unlazy_walk(nd, path->dentry))) {
+ error = -ECHILD;
+ goto exit;
+ }
+ }
+ BUG_ON(inode != path->dentry->d_inode);
return NULL;
+ }
- path_to_nameidata(path, nd);
- nd->inode = path->dentry->d_inode;
+ if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
+ path_to_nameidata(path, nd);
+ } else {
+ save_parent.dentry = nd->path.dentry;
+ save_parent.mnt = mntget(path->mnt);
+ nd->path.dentry = path->dentry;
+
+ }
+ nd->inode = inode;
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
error = complete_walk(nd);
- if (error)
+ if (error) {
+ path_put(&save_parent);
return ERR_PTR(error);
+ }
error = -EISDIR;
- if (S_ISDIR(nd->inode->i_mode))
+ if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
+ goto exit;
+ error = -ENOTDIR;
+ if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
goto exit;
+ audit_inode(pathname, nd->path.dentry);
ok:
if (!S_ISREG(nd->inode->i_mode))
will_truncate = 0;
@@ -2333,6 +2389,20 @@ common:
if (error)
goto exit;
filp = nameidata_to_filp(nd);
+ if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) {
+ BUG_ON(save_parent.dentry != dir);
+ path_put(&nd->path);
+ nd->path = save_parent;
+ nd->inode = dir->d_inode;
+ save_parent.mnt = NULL;
+ save_parent.dentry = NULL;
+ if (want_write) {
+ mnt_drop_write(nd->path.mnt);
+ want_write = 0;
+ }
+ retried = true;
+ goto retry_lookup;
+ }
if (!IS_ERR(filp)) {
error = ima_file_check(filp, op->acc_mode);
if (error) {
@@ -2352,7 +2422,8 @@ common:
out:
if (want_write)
mnt_drop_write(nd->path.mnt);
- path_put(&nd->path);
+ path_put(&save_parent);
+ terminate_walk(nd);
return filp;
exit_mutex_unlock:
@@ -2415,6 +2486,12 @@ out:
if (base)
fput(base);
release_open_intent(nd);
+ if (filp == ERR_PTR(-EOPENSTALE)) {
+ if (flags & LOOKUP_RCU)
+ filp = ERR_PTR(-ECHILD);
+ else
+ filp = ERR_PTR(-ESTALE);
+ }
return filp;
out_filp:
diff --git a/fs/namespace.c b/fs/namespace.c
index e6081996c9a..1e4a5fe3d7b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -397,7 +397,7 @@ static int mnt_make_readonly(struct mount *mnt)
{
int ret = 0;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
/*
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -431,15 +431,15 @@ static int mnt_make_readonly(struct mount *mnt)
*/
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
return ret;
}
static void __mnt_unmake_readonly(struct mount *mnt)
{
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
mnt->mnt.mnt_flags &= ~MNT_READONLY;
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
}
int sb_prepare_remount_readonly(struct super_block *sb)
@@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
return err;
}
@@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path *path)
{
struct mount *child_mnt;
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
if (child_mnt) {
mnt_add_count(child_mnt, 1);
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return &child_mnt->mnt;
} else {
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return NULL;
}
}
@@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
mnt->mnt.mnt_sb = root->d_sb;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
if (flag & CL_SLAVE) {
list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -803,35 +803,36 @@ static void mntput_no_expire(struct mount *mnt)
{
put_again:
#ifdef CONFIG_SMP
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
if (likely(atomic_read(&mnt->mnt_longterm))) {
mnt_add_count(mnt, -1);
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return;
}
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
mnt_add_count(mnt, -1);
if (mnt_get_count(mnt)) {
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
return;
}
#else
mnt_add_count(mnt, -1);
if (likely(mnt_get_count(mnt)))
return;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
#endif
if (unlikely(mnt->mnt_pinned)) {
mnt_add_count(mnt, mnt->mnt_pinned + 1);
mnt->mnt_pinned = 0;
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
acct_auto_close_mnt(&mnt->mnt);
goto put_again;
}
+
list_del(&mnt->mnt_instance);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
mntfree(mnt);
}
@@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget);
void mnt_pin(struct vfsmount *mnt)
{
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
real_mount(mnt)->mnt_pinned++;
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
}
EXPORT_SYMBOL(mnt_pin);
void mnt_unpin(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
if (mnt->mnt_pinned) {
mnt_add_count(mnt, 1);
mnt->mnt_pinned--;
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
}
EXPORT_SYMBOL(mnt_unpin);
@@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m)
BUG_ON(!m);
/* write lock needed for mnt_get_count */
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += mnt_get_count(p);
minimum_refs += 2;
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
if (actual_refs > minimum_refs)
return 0;
@@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt)
{
int ret = 1;
down_read(&namespace_sem);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
if (propagate_mount_busy(real_mount(mnt), 2))
ret = 0;
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
up_read(&namespace_sem);
return ret;
}
@@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *head)
struct dentry *dentry;
struct mount *m;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
dentry = mnt->mnt_mountpoint;
m = mnt->mnt_parent;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
m->mnt_ghosts--;
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
dput(dentry);
mntput(&m->mnt);
}
@@ -1073,8 +1074,9 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
+ if (p->mnt_ns)
+ __mnt_make_shortterm(p);
p->mnt_ns = NULL;
- __mnt_make_shortterm(p);
list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
p->mnt_parent->mnt_ghosts++;
@@ -1112,12 +1114,12 @@ static int do_umount(struct mount *mnt, int flags)
* probably don't strictly need the lock here if we examined
* all race cases, but it's a slowpath.
*/
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
if (mnt_get_count(mnt) != 2) {
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
return -EBUSY;
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
if (!xchg(&mnt->mnt_expiry_mark, 1))
return -EAGAIN;
@@ -1159,7 +1161,7 @@ static int do_umount(struct mount *mnt, int flags)
}
down_write(&namespace_sem);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
event++;
if (!(flags & MNT_DETACH))
@@ -1171,7 +1173,7 @@ static int do_umount(struct mount *mnt, int flags)
umount_tree(mnt, 1, &umount_list);
retval = 0;
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
return retval;
@@ -1286,19 +1288,19 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
q = clone_mnt(p, p->mnt.mnt_root, flag);
if (!q)
goto Enomem;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &path);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
}
}
return res;
Enomem:
if (res) {
LIST_HEAD(umount_list);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
umount_tree(res, 0, &umount_list);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
release_mounts(&umount_list);
}
return NULL;
@@ -1318,9 +1320,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
{
LIST_HEAD(umount_list);
down_write(&namespace_sem);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
umount_tree(real_mount(mnt), 0, &umount_list);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
}
@@ -1448,7 +1450,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
if (err)
goto out_cleanup_ids;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1467,7 +1469,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
list_del_init(&child->mnt_hash);
commit_tree(child);
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
return 0;
@@ -1565,10 +1567,10 @@ static int do_change_type(struct path *path, int flag)
goto out_unlock;
}
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
out_unlock:
up_write(&namespace_sem);
@@ -1617,9 +1619,9 @@ static int do_loopback(struct path *path, char *old_name,
err = graft_tree(mnt, path);
if (err) {
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
umount_tree(mnt, 0, &umount_list);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
}
out2:
unlock_mount(path);
@@ -1677,16 +1679,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
else
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
mnt->mnt.mnt_flags = mnt_flags;
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
}
up_write(&sb->s_umount);
if (!err) {
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
touch_mnt_namespace(mnt->mnt_ns);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
}
return err;
}
@@ -1893,9 +1895,9 @@ fail:
/* remove m from any expiration list it may be on */
if (!list_empty(&mnt->mnt_expire)) {
down_write(&namespace_sem);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
list_del_init(&mnt->mnt_expire);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
up_write(&namespace_sem);
}
mntput(m);
@@ -1911,11 +1913,11 @@ fail:
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
down_write(&namespace_sem);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
up_write(&namespace_sem);
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -1935,7 +1937,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
return;
down_write(&namespace_sem);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
/* extract from the expiration list every vfsmount that matches the
* following criteria:
@@ -1954,7 +1956,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, 1, &umounts);
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umounts);
@@ -2218,9 +2220,9 @@ void mnt_make_shortterm(struct vfsmount *m)
struct mount *mnt = real_mount(m);
if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
return;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
atomic_dec(&mnt->mnt_longterm);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
#endif
}
@@ -2250,9 +2252,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
return ERR_PTR(-ENOMEM);
}
new_ns->root = new;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
list_add_tail(&new_ns->list, &new->mnt_list);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2416,9 +2418,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
int path_is_under(struct path *path1, struct path *path2)
{
int res;
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return res;
}
EXPORT_SYMBOL(path_is_under);
@@ -2505,7 +2507,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* make sure we can reach put_old from new_root */
if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
goto out4;
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
detach_mnt(new_mnt, &parent_path);
detach_mnt(root_mnt, &root_parent);
/* mount old root on put_old */
@@ -2513,7 +2515,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* mount new_root on / */
attach_mnt(new_mnt, &root_parent);
touch_mnt_namespace(current->nsproxy->mnt_ns);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
chroot_fs_refs(&root, &new);
error = 0;
out4:
@@ -2576,7 +2578,7 @@ void __init mnt_init(void)
for (u = 0; u < HASH_SIZE; u++)
INIT_LIST_HEAD(&mount_hashtable[u]);
- br_lock_init(vfsmount_lock);
+ br_lock_init(&vfsmount_lock);
err = sysfs_init();
if (err)
@@ -2596,9 +2598,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
if (!atomic_dec_and_test(&ns->count))
return;
down_write(&namespace_sem);
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
umount_tree(ns->root, 0, &umount_list);
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
kfree(ns);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3ff5fcc1528..122e260247f 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -221,6 +221,10 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
already_written = 0;
+ errno = file_update_time(file);
+ if (errno)
+ goto outrel;
+
bouncebuffer = vmalloc(bufsize);
if (!bouncebuffer) {
errno = -EIO; /* -ENOMEM */
@@ -252,8 +256,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
}
vfree(bouncebuffer);
- file_update_time(file);
-
*ppos = pos;
if (pos > i_size_read(inode)) {
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 4af803f1351..54cc0cdb3dc 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -23,17 +23,17 @@ struct ncp_mount_data_kernel {
unsigned long flags; /* NCP_MOUNT_* flags */
unsigned int int_flags; /* internal flags */
#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
- __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */
+ uid_t mounted_uid; /* Who may umount() this filesystem? */
struct pid *wdog_pid; /* Who cares for our watchdog packets? */
unsigned int ncp_fd; /* The socket to the ncp port */
unsigned int time_out; /* How long should I wait after
sending a NCP request? */
unsigned int retry_count; /* And how often should I retry? */
unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
- __kernel_uid32_t uid;
- __kernel_gid32_t gid;
- __kernel_mode_t file_mode;
- __kernel_mode_t dir_mode;
+ uid_t uid;
+ gid_t gid;
+ umode_t file_mode;
+ umode_t dir_mode;
int info_fd;
};
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index eb95f5091c1..970659daa32 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,6 +17,7 @@
#include <linux/kthread.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
#include <net/inet_sock.h>
@@ -253,6 +254,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
char svc_name[12];
int ret = 0;
int minorversion_setup;
+ struct net *net = current->nsproxy->net_ns;
mutex_lock(&nfs_callback_mutex);
if (cb_info->users++ || cb_info->task != NULL) {
@@ -265,6 +267,12 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
goto out_err;
}
+ ret = svc_bind(serv, net);
+ if (ret < 0) {
+ printk(KERN_WARNING "NFS: bind callback service failed\n");
+ goto out_err;
+ }
+
minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion,
serv, xprt, &rqstp, &callback_svc);
if (!minorversion_setup) {
@@ -306,6 +314,8 @@ out_err:
dprintk("NFS: Couldn't create callback socket or server thread; "
"err = %d\n", ret);
cb_info->users--;
+ if (serv)
+ svc_shutdown_net(serv, net);
goto out;
}
@@ -320,6 +330,7 @@ void nfs_callback_down(int minorversion)
cb_info->users--;
if (cb_info->users == 0 && cb_info->task != NULL) {
kthread_stop(cb_info->task);
+ svc_shutdown_net(cb_info->serv, current->nsproxy->net_ns);
svc_exit_thread(cb_info->rqst);
cb_info->serv = NULL;
cb_info->rqst = NULL;
@@ -332,7 +343,7 @@ void nfs_callback_down(int minorversion)
int
check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
{
- char *p = svc_gss_principal(rqstp);
+ char *p = rqstp->rq_cred.cr_principal;
if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
return 1;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0989a209968..f430057ff3b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1354,10 +1354,10 @@ out:
}
#ifdef CONFIG_NFS_V4
-static int nfs_open_revalidate(struct dentry *, struct nameidata *);
+static int nfs4_lookup_revalidate(struct dentry *, struct nameidata *);
const struct dentry_operations nfs4_dentry_operations = {
- .d_revalidate = nfs_open_revalidate,
+ .d_revalidate = nfs4_lookup_revalidate,
.d_delete = nfs_dentry_delete,
.d_iput = nfs_dentry_iput,
.d_automount = nfs_d_automount,
@@ -1519,13 +1519,11 @@ no_open:
return nfs_lookup(dir, dentry, nd);
}
-static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct dentry *parent = NULL;
struct inode *inode;
struct inode *dir;
- struct nfs_open_context *ctx;
- struct iattr attr;
int openflags, ret = 0;
if (nd->flags & LOOKUP_RCU)
@@ -1554,57 +1552,13 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
/* We cannot do exclusive creation on a positive dentry */
if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
goto no_open_dput;
- /* We can't create new files here */
- openflags &= ~(O_CREAT|O_EXCL);
-
- ctx = create_nfs_open_context(dentry, openflags);
- ret = PTR_ERR(ctx);
- if (IS_ERR(ctx))
- goto out;
- attr.ia_valid = ATTR_OPEN;
- if (openflags & O_TRUNC) {
- attr.ia_valid |= ATTR_SIZE;
- attr.ia_size = 0;
- nfs_wb_all(inode);
- }
-
- /*
- * Note: we're not holding inode->i_mutex and so may be racing with
- * operations that change the directory. We therefore save the
- * change attribute *before* we do the RPC call.
- */
- inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- switch (ret) {
- case -EPERM:
- case -EACCES:
- case -EDQUOT:
- case -ENOSPC:
- case -EROFS:
- goto out_put_ctx;
- default:
- goto out_drop;
- }
- }
- iput(inode);
- if (inode != dentry->d_inode)
- goto out_drop;
+ /* Let f_op->open() actually open (and revalidate) the file */
+ ret = 1;
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- ret = nfs_intent_set_file(nd, ctx);
- if (ret >= 0)
- ret = 1;
out:
dput(parent);
return ret;
-out_drop:
- d_drop(dentry);
- ret = 0;
-out_put_ctx:
- put_nfs_open_context(ctx);
- goto out;
no_open_dput:
dput(parent);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 56311ca5f9f..a6708e6b438 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -879,12 +879,81 @@ const struct file_operations nfs_file_operations = {
static int
nfs4_file_open(struct inode *inode, struct file *filp)
{
+ struct nfs_open_context *ctx;
+ struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *parent = NULL;
+ struct inode *dir;
+ unsigned openflags = filp->f_flags;
+ struct iattr attr;
+ int err;
+
+ BUG_ON(inode != dentry->d_inode);
/*
- * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
- * this point, then something is very wrong
+ * If no cached dentry exists or if it's negative, NFSv4 handled the
+ * opens in ->lookup() or ->create().
+ *
+ * We only get this far for a cached positive dentry. We skipped
+ * revalidation, so handle it here by dropping the dentry and returning
+ * -EOPENSTALE. The VFS will retry the lookup/create/open.
*/
- dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
- return -ENOTDIR;
+
+ dprintk("NFS: open file(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
+
+ if ((openflags & O_ACCMODE) == 3)
+ openflags--;
+
+ /* We can't create new files here */
+ openflags &= ~(O_CREAT|O_EXCL);
+
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+
+ ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ err = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+
+ attr.ia_valid = ATTR_OPEN;
+ if (openflags & O_TRUNC) {
+ attr.ia_valid |= ATTR_SIZE;
+ attr.ia_size = 0;
+ nfs_wb_all(inode);
+ }
+
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ switch (err) {
+ case -EPERM:
+ case -EACCES:
+ case -EDQUOT:
+ case -ENOSPC:
+ case -EROFS:
+ goto out_put_ctx;
+ default:
+ goto out_drop;
+ }
+ }
+ iput(inode);
+ if (inode != dentry->d_inode)
+ goto out_drop;
+
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_file_set_open_context(filp, ctx);
+ err = 0;
+
+out_put_ctx:
+ put_nfs_open_context(ctx);
+out:
+ dput(parent);
+ return err;
+
+out_drop:
+ d_drop(dentry);
+ err = -EOPENSTALE;
+ goto out_put_ctx;
}
const struct file_operations nfs4_file_operations = {
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 204438cc914..34a10d78b83 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -11,7 +11,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
for (f = exp->ex_flavors; f < end; f++) {
- if (f->pseudoflavor == rqstp->rq_flavor)
+ if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
return f->flags;
}
return exp->ex_flags;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index dcb52b88451..ba233499b9a 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -706,7 +706,7 @@ static struct cache_head *svc_export_alloc(void)
return NULL;
}
-struct cache_detail svc_export_cache_template = {
+static struct cache_detail svc_export_cache_template = {
.owner = THIS_MODULE,
.hash_size = EXPORT_HASHMAX,
.name = "nfsd.export",
@@ -904,13 +904,13 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
return 0;
/* ip-address based client; check sec= export option: */
for (f = exp->ex_flavors; f < end; f++) {
- if (f->pseudoflavor == rqstp->rq_flavor)
+ if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
return 0;
}
/* defaults in absence of sec= options: */
if (exp->ex_nflavors == 0) {
- if (rqstp->rq_flavor == RPC_AUTH_NULL ||
- rqstp->rq_flavor == RPC_AUTH_UNIX)
+ if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL ||
+ rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
return 0;
}
return nfserr_wrongsec;
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 9559ce46873..e6c38159622 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -58,6 +58,7 @@ static int nfsd_inject_set(void *op_ptr, u64 val)
static int nfsd_inject_get(void *data, u64 *val)
{
+ *val = 0;
return 0;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c8e9f637153..a5fd6b982f2 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -650,9 +650,10 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
struct rpc_clnt *client;
if (clp->cl_minorversion == 0) {
- if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+ if (!clp->cl_cred.cr_principal &&
+ (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
return -EINVAL;
- args.client_name = clp->cl_principal;
+ args.client_name = clp->cl_cred.cr_principal;
args.prognumber = conn->cb_prog,
args.protocol = XPRT_TRANSPORT_TCP;
args.authflavor = clp->cl_flavor;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 286a7f8f202..dae36f1dee9 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -605,7 +605,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
static __be32
do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
{
- if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+ if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
if (numeric_name_to_id(rqstp, type, name, namelen, id))
return 0;
/*
@@ -618,7 +618,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
static int
do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
{
- if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+ if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
return sprintf(name, "%u", id);
return idmap_id_to_name(rqstp, type, id, name);
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index ed3f9206a0e..5ff0b7b9fc0 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -570,7 +570,7 @@ static ssize_t
cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
struct cld_upcall *tmp, *cup;
- struct cld_msg *cmsg = (struct cld_msg *)src;
+ struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
uint32_t xid;
struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
nfsd_net_id);
@@ -1029,7 +1029,7 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
return ret;
}
-struct notifier_block nfsd4_cld_block = {
+static struct notifier_block nfsd4_cld_block = {
.notifier_call = rpc_pipefs_event,
};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 03f82c0bc35..8fdc9ec5c5d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -42,6 +42,7 @@
#include <linux/sunrpc/clnt.h>
#include "xdr4.h"
#include "vfs.h"
+#include "current_stateid.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -447,37 +448,69 @@ static struct list_head close_lru;
*
* which we should reject.
*/
-static void
-set_access(unsigned int *access, unsigned long bmap) {
+static unsigned int
+bmap_to_share_mode(unsigned long bmap) {
int i;
+ unsigned int access = 0;
- *access = 0;
for (i = 1; i < 4; i++) {
if (test_bit(i, &bmap))
- *access |= i;
- }
-}
-
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
- int i;
-
- *deny = 0;
- for (i = 0; i < 4; i++) {
- if (test_bit(i, &bmap))
- *deny |= i ;
+ access |= i;
}
+ return access;
}
-static int
+static bool
test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
unsigned int access, deny;
- set_access(&access, stp->st_access_bmap);
- set_deny(&deny, stp->st_deny_bmap);
+ access = bmap_to_share_mode(stp->st_access_bmap);
+ deny = bmap_to_share_mode(stp->st_deny_bmap);
if ((access & open->op_share_deny) || (deny & open->op_share_access))
- return 0;
- return 1;
+ return false;
+ return true;
+}
+
+/* set share access for a given stateid */
+static inline void
+set_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+ __set_bit(access, &stp->st_access_bmap);
+}
+
+/* clear share access for a given stateid */
+static inline void
+clear_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+ __clear_bit(access, &stp->st_access_bmap);
+}
+
+/* test whether a given stateid has access */
+static inline bool
+test_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+ return test_bit(access, &stp->st_access_bmap);
+}
+
+/* set share deny for a given stateid */
+static inline void
+set_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+ __set_bit(access, &stp->st_deny_bmap);
+}
+
+/* clear share deny for a given stateid */
+static inline void
+clear_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+ __clear_bit(access, &stp->st_deny_bmap);
+}
+
+/* test whether a given stateid is denying specific access */
+static inline bool
+test_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+ return test_bit(access, &stp->st_deny_bmap);
}
static int nfs4_access_to_omode(u32 access)
@@ -493,6 +526,20 @@ static int nfs4_access_to_omode(u32 access)
BUG();
}
+/* release all access and file references for a given stateid */
+static void
+release_all_access(struct nfs4_ol_stateid *stp)
+{
+ int i;
+
+ for (i = 1; i < 4; i++) {
+ if (test_access(i, stp))
+ nfs4_file_put_access(stp->st_file,
+ nfs4_access_to_omode(i));
+ clear_access(i, stp);
+ }
+}
+
static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
{
list_del(&stp->st_perfile);
@@ -501,16 +548,7 @@ static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
static void close_generic_stateid(struct nfs4_ol_stateid *stp)
{
- int i;
-
- if (stp->st_access_bmap) {
- for (i = 1; i < 4; i++) {
- if (test_bit(i, &stp->st_access_bmap))
- nfs4_file_put_access(stp->st_file,
- nfs4_access_to_omode(i));
- __clear_bit(i, &stp->st_access_bmap);
- }
- }
+ release_all_access(stp);
put_nfs4_file(stp->st_file);
stp->st_file = NULL;
}
@@ -885,7 +923,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
struct nfsd4_session *new;
struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
int numslots, slotsize;
- int status;
+ __be32 status;
int idx;
/*
@@ -984,7 +1022,8 @@ static inline void
renew_client_locked(struct nfs4_client *clp)
{
if (is_client_expired(clp)) {
- dprintk("%s: client (clientid %08x/%08x) already expired\n",
+ WARN_ON(1);
+ printk("%s: client (clientid %08x/%08x) already expired\n",
__func__,
clp->cl_clientid.cl_boot,
clp->cl_clientid.cl_id);
@@ -1049,9 +1088,7 @@ free_client(struct nfs4_client *clp)
list_del(&ses->se_perclnt);
nfsd4_put_session_locked(ses);
}
- if (clp->cl_cred.cr_group_info)
- put_group_info(clp->cl_cred.cr_group_info);
- kfree(clp->cl_principal);
+ free_svc_cred(&clp->cl_cred);
kfree(clp->cl_name.data);
kfree(clp);
}
@@ -1132,12 +1169,21 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-static void copy_cred(struct svc_cred *target, struct svc_cred *source)
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
{
+ if (source->cr_principal) {
+ target->cr_principal =
+ kstrdup(source->cr_principal, GFP_KERNEL);
+ if (target->cr_principal == NULL)
+ return -ENOMEM;
+ } else
+ target->cr_principal = NULL;
+ target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
target->cr_group_info = source->cr_group_info;
get_group_info(target->cr_group_info);
+ return 0;
}
static int same_name(const char *n1, const char *n2)
@@ -1157,11 +1203,31 @@ same_clid(clientid_t *cl1, clientid_t *cl2)
return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id);
}
-/* XXX what about NGROUP */
+static bool groups_equal(struct group_info *g1, struct group_info *g2)
+{
+ int i;
+
+ if (g1->ngroups != g2->ngroups)
+ return false;
+ for (i=0; i<g1->ngroups; i++)
+ if (GROUP_AT(g1, i) != GROUP_AT(g2, i))
+ return false;
+ return true;
+}
+
static int
same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
{
- return cr1->cr_uid == cr2->cr_uid;
+ if ((cr1->cr_flavor != cr2->cr_flavor)
+ || (cr1->cr_uid != cr2->cr_uid)
+ || (cr1->cr_gid != cr2->cr_gid)
+ || !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
+ return false;
+ if (cr1->cr_principal == cr2->cr_principal)
+ return true;
+ if (!cr1->cr_principal || !cr2->cr_principal)
+ return false;
+ return 0 == strcmp(cr1->cr_principal, cr1->cr_principal);
}
static void gen_clid(struct nfs4_client *clp)
@@ -1204,25 +1270,20 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
{
struct nfs4_client *clp;
struct sockaddr *sa = svc_addr(rqstp);
- char *princ;
+ int ret;
clp = alloc_client(name);
if (clp == NULL)
return NULL;
INIT_LIST_HEAD(&clp->cl_sessions);
-
- princ = svc_gss_principal(rqstp);
- if (princ) {
- clp->cl_principal = kstrdup(princ, GFP_KERNEL);
- if (clp->cl_principal == NULL) {
- spin_lock(&client_lock);
- free_client(clp);
- spin_unlock(&client_lock);
- return NULL;
- }
+ ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
+ if (ret) {
+ spin_lock(&client_lock);
+ free_client(clp);
+ spin_unlock(&client_lock);
+ return NULL;
}
-
idr_init(&clp->cl_stateids);
memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
atomic_set(&clp->cl_refcount, 0);
@@ -1240,8 +1301,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
copy_verf(clp, verf);
rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
- clp->cl_flavor = rqstp->rq_flavor;
- copy_cred(&clp->cl_cred, &rqstp->rq_cred);
gen_confirm(clp);
clp->cl_cb_session = NULL;
return clp;
@@ -1470,18 +1529,32 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
clid->flags = new->cl_exchange_flags;
}
+static bool client_has_state(struct nfs4_client *clp)
+{
+ /*
+ * Note clp->cl_openowners check isn't quite right: there's no
+ * need to count owners without stateid's.
+ *
+ * Also note we should probably be using this in 4.0 case too.
+ */
+ return !list_empty(&clp->cl_openowners)
+ || !list_empty(&clp->cl_delegations)
+ || !list_empty(&clp->cl_sessions);
+}
+
__be32
nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_exchange_id *exid)
{
struct nfs4_client *unconf, *conf, *new;
- int status;
+ __be32 status;
unsigned int strhashval;
char dname[HEXDIR_LEN];
char addr_str[INET6_ADDRSTRLEN];
nfs4_verifier verf = exid->verifier;
struct sockaddr *sa = svc_addr(rqstp);
+ bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
rpc_ntop(sa, addr_str, sizeof(addr_str));
dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1507,71 +1580,63 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
status = nfs4_make_rec_clidname(dname, &exid->clname);
if (status)
- goto error;
+ return status;
strhashval = clientstr_hashval(dname);
+ /* Cases below refer to rfc 5661 section 18.35.4: */
nfs4_lock_state();
- status = nfs_ok;
-
conf = find_confirmed_client_by_str(dname, strhashval);
if (conf) {
- if (!clp_used_exchangeid(conf)) {
- status = nfserr_clid_inuse; /* XXX: ? */
- goto out;
- }
- if (!same_verf(&verf, &conf->cl_verifier)) {
- /* 18.35.4 case 8 */
- if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+ bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
+ bool verfs_match = same_verf(&verf, &conf->cl_verifier);
+
+ if (update) {
+ if (!clp_used_exchangeid(conf)) { /* buggy client */
+ status = nfserr_inval;
+ goto out;
+ }
+ if (!creds_match) { /* case 9 */
+ status = nfserr_perm;
+ goto out;
+ }
+ if (!verfs_match) { /* case 8 */
status = nfserr_not_same;
goto out;
}
- /* Client reboot: destroy old state */
- expire_client(conf);
- goto out_new;
+ /* case 6 */
+ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+ new = conf;
+ goto out_copy;
}
- if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
- /* 18.35.4 case 9 */
- if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
- status = nfserr_perm;
+ if (!creds_match) { /* case 3 */
+ if (client_has_state(conf)) {
+ status = nfserr_clid_inuse;
goto out;
}
expire_client(conf);
goto out_new;
}
- /*
- * Set bit when the owner id and verifier map to an already
- * confirmed client id (18.35.3).
- */
- exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
-
- /*
- * Falling into 18.35.4 case 2, possible router replay.
- * Leave confirmed record intact and return same result.
- */
- copy_verf(conf, &verf);
- new = conf;
- goto out_copy;
+ if (verfs_match) { /* case 2 */
+ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+ new = conf;
+ goto out_copy;
+ }
+ /* case 5, client reboot */
+ goto out_new;
}
- /* 18.35.4 case 7 */
- if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+ if (update) { /* case 7 */
status = nfserr_noent;
goto out;
}
unconf = find_unconfirmed_client_by_str(dname, strhashval);
- if (unconf) {
- /*
- * Possible retry or client restart. Per 18.35.4 case 4,
- * a new unconfirmed record should be generated regardless
- * of whether any properties have changed.
- */
+ if (unconf) /* case 4, possible retry or client restart */
expire_client(unconf);
- }
+ /* case 1 (normal case) */
out_new:
- /* Normal case */
new = create_client(exid->clname, dname, rqstp, &verf);
if (new == NULL) {
status = nfserr_jukebox;
@@ -1584,7 +1649,7 @@ out_copy:
exid->clientid.cl_boot = new->cl_clientid.cl_boot;
exid->clientid.cl_id = new->cl_clientid.cl_id;
- exid->seqid = 1;
+ exid->seqid = new->cl_cs_slot.sl_seqid + 1;
nfsd4_set_ex_flags(new, exid);
dprintk("nfsd4_exchange_id seqid %d flags %x\n",
@@ -1593,12 +1658,10 @@ out_copy:
out:
nfs4_unlock_state();
-error:
- dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
return status;
}
-static int
+static __be32
check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
{
dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
@@ -1626,7 +1689,7 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
*/
static void
nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
- struct nfsd4_clid_slot *slot, int nfserr)
+ struct nfsd4_clid_slot *slot, __be32 nfserr)
{
slot->sl_status = nfserr;
memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
@@ -1657,7 +1720,7 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
/* seqid, slotID, slotID, slotID, status */ \
5 ) * sizeof(__be32))
-static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
+static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
{
return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
|| fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
@@ -1673,7 +1736,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_session *new;
struct nfsd4_clid_slot *cs_slot = NULL;
bool confirm_me = false;
- int status = 0;
+ __be32 status = 0;
if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
return nfserr_inval;
@@ -1686,16 +1749,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status == nfserr_replay_cache) {
- dprintk("Got a create_session replay! seqid= %d\n",
- cs_slot->sl_seqid);
- /* Return the cached reply status */
status = nfsd4_replay_create_session(cr_ses, cs_slot);
goto out;
} else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
status = nfserr_seq_misordered;
- dprintk("Sequence misordered!\n");
- dprintk("Expected seqid= %d but got seqid= %d\n",
- cs_slot->sl_seqid, cr_ses->seqid);
goto out;
}
} else if (unconf) {
@@ -1704,7 +1761,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
goto out;
}
-
cs_slot = &unconf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status) {
@@ -1712,7 +1768,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfserr_seq_misordered;
goto out;
}
-
confirm_me = true;
conf = unconf;
} else {
@@ -1749,8 +1804,14 @@ nfsd4_create_session(struct svc_rqst *rqstp,
/* cache solo and embedded create sessions under the state lock */
nfsd4_cache_create_session(cr_ses, cs_slot, status);
- if (confirm_me)
+ if (confirm_me) {
+ unsigned int hash = clientstr_hashval(unconf->cl_recdir);
+ struct nfs4_client *old =
+ find_confirmed_client_by_str(conf->cl_recdir, hash);
+ if (old)
+ expire_client(old);
move_to_confirmed(conf);
+ }
out:
nfs4_unlock_state();
dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1818,7 +1879,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
struct nfsd4_destroy_session *sessionid)
{
struct nfsd4_session *ses;
- u32 status = nfserr_badsession;
+ __be32 status = nfserr_badsession;
/* Notes:
* - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1914,7 +1975,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_session *session;
struct nfsd4_slot *slot;
struct nfsd4_conn *conn;
- int status;
+ __be32 status;
if (resp->opcnt != 1)
return nfserr_sequence_pos;
@@ -2008,18 +2069,11 @@ out:
return status;
}
-static inline bool has_resources(struct nfs4_client *clp)
-{
- return !list_empty(&clp->cl_openowners)
- || !list_empty(&clp->cl_delegations)
- || !list_empty(&clp->cl_sessions);
-}
-
__be32
nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
{
struct nfs4_client *conf, *unconf, *clp;
- int status = 0;
+ __be32 status = 0;
nfs4_lock_state();
unconf = find_unconfirmed_client(&dc->clientid);
@@ -2028,7 +2082,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
if (conf) {
clp = conf;
- if (!is_client_expired(conf) && has_resources(conf)) {
+ if (!is_client_expired(conf) && client_has_state(conf)) {
status = nfserr_clientid_busy;
goto out;
}
@@ -2055,7 +2109,7 @@ out:
__be32
nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
{
- int status = 0;
+ __be32 status = 0;
if (rc->rca_one_fs) {
if (!cstate->current_fh.fh_dentry)
@@ -2106,17 +2160,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
- /*
- * XXX The Duplicate Request Cache (DRC) has been checked (??)
- * We get here on a DRC miss.
- */
-
strhashval = clientstr_hashval(dname);
+ /* Cases below refer to rfc 3530 section 14.2.33: */
nfs4_lock_state();
conf = find_confirmed_client_by_str(dname, strhashval);
if (conf) {
- /* RFC 3530 14.2.33 CASE 0: */
+ /* case 0: */
status = nfserr_clid_inuse;
if (clp_used_exchangeid(conf))
goto out;
@@ -2129,63 +2179,18 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
}
- /*
- * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
- * has a description of SETCLIENTID request processing consisting
- * of 5 bullet points, labeled as CASE0 - CASE4 below.
- */
unconf = find_unconfirmed_client_by_str(dname, strhashval);
+ if (unconf)
+ expire_client(unconf);
status = nfserr_jukebox;
- if (!conf) {
- /*
- * RFC 3530 14.2.33 CASE 4:
- * placed first, because it is the normal case
- */
- if (unconf)
- expire_client(unconf);
- new = create_client(clname, dname, rqstp, &clverifier);
- if (new == NULL)
- goto out;
- gen_clid(new);
- } else if (same_verf(&conf->cl_verifier, &clverifier)) {
- /*
- * RFC 3530 14.2.33 CASE 1:
- * probable callback update
- */
- if (unconf) {
- /* Note this is removing unconfirmed {*x***},
- * which is stronger than RFC recommended {vxc**}.
- * This has the advantage that there is at most
- * one {*x***} in either list at any time.
- */
- expire_client(unconf);
- }
- new = create_client(clname, dname, rqstp, &clverifier);
- if (new == NULL)
- goto out;
+ new = create_client(clname, dname, rqstp, &clverifier);
+ if (new == NULL)
+ goto out;
+ if (conf && same_verf(&conf->cl_verifier, &clverifier))
+ /* case 1: probable callback update */
copy_clid(new, conf);
- } else if (!unconf) {
- /*
- * RFC 3530 14.2.33 CASE 2:
- * probable client reboot; state will be removed if
- * confirmed.
- */
- new = create_client(clname, dname, rqstp, &clverifier);
- if (new == NULL)
- goto out;
- gen_clid(new);
- } else {
- /*
- * RFC 3530 14.2.33 CASE 3:
- * probable client reboot; state will be removed if
- * confirmed.
- */
- expire_client(unconf);
- new = create_client(clname, dname, rqstp, &clverifier);
- if (new == NULL)
- goto out;
+ else /* case 4 (new client) or cases 2, 3 (client reboot): */
gen_clid(new);
- }
/*
* XXX: we should probably set this at creation time, and check
* for consistent minorversion use throughout:
@@ -2203,17 +2208,11 @@ out:
}
-/*
- * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
- * a description of SETCLIENTID_CONFIRM request processing consisting of 4
- * bullets, labeled as CASE1 - CASE4 below.
- */
__be32
nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid_confirm *setclientid_confirm)
{
- struct sockaddr *sa = svc_addr(rqstp);
struct nfs4_client *conf, *unconf;
nfs4_verifier confirm = setclientid_confirm->sc_confirm;
clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -2221,84 +2220,44 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
if (STALE_CLIENTID(clid))
return nfserr_stale_clientid;
- /*
- * XXX The Duplicate Request Cache (DRC) has been checked (??)
- * We get here on a DRC miss.
- */
-
nfs4_lock_state();
conf = find_confirmed_client(clid);
unconf = find_unconfirmed_client(clid);
-
- status = nfserr_clid_inuse;
- if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
- goto out;
- if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
- goto out;
-
/*
- * section 14.2.34 of RFC 3530 has a description of
- * SETCLIENTID_CONFIRM request processing consisting
- * of 4 bullet points, labeled as CASE1 - CASE4 below.
+ * We try hard to give out unique clientid's, so if we get an
+ * attempt to confirm the same clientid with a different cred,
+ * there's a bug somewhere. Let's charitably assume it's our
+ * bug.
*/
- if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
- /*
- * RFC 3530 14.2.34 CASE 1:
- * callback update
- */
- if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
- status = nfserr_clid_inuse;
- else {
- nfsd4_change_callback(conf, &unconf->cl_cb_conn);
- nfsd4_probe_callback(conf);
- expire_client(unconf);
+ status = nfserr_serverfault;
+ if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
+ goto out;
+ if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
+ goto out;
+ /* cases below refer to rfc 3530 section 14.2.34: */
+ if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
+ if (conf && !unconf) /* case 2: probable retransmit */
status = nfs_ok;
+ else /* case 4: client hasn't noticed we rebooted yet? */
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+ status = nfs_ok;
+ if (conf) { /* case 1: callback update */
+ nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+ nfsd4_probe_callback(conf);
+ expire_client(unconf);
+ } else { /* case 3: normal case; new or rebooted client */
+ unsigned int hash = clientstr_hashval(unconf->cl_recdir);
+ conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+ if (conf) {
+ nfsd4_client_record_remove(conf);
+ expire_client(conf);
}
- } else if (conf && !unconf) {
- /*
- * RFC 3530 14.2.34 CASE 2:
- * probable retransmitted request; play it safe and
- * do nothing.
- */
- if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
- status = nfserr_clid_inuse;
- else
- status = nfs_ok;
- } else if (!conf && unconf
- && same_verf(&unconf->cl_confirm, &confirm)) {
- /*
- * RFC 3530 14.2.34 CASE 3:
- * Normal case; new or rebooted client:
- */
- if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
- status = nfserr_clid_inuse;
- } else {
- unsigned int hash =
- clientstr_hashval(unconf->cl_recdir);
- conf = find_confirmed_client_by_str(unconf->cl_recdir,
- hash);
- if (conf) {
- nfsd4_client_record_remove(conf);
- expire_client(conf);
- }
- move_to_confirmed(unconf);
- conf = unconf;
- nfsd4_probe_callback(conf);
- status = nfs_ok;
- }
- } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
- && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
- &confirm)))) {
- /*
- * RFC 3530 14.2.34 CASE 4:
- * Client probably hasn't noticed that we rebooted yet.
- */
- status = nfserr_stale_clientid;
- } else {
- /* check that we have hit one of the cases...*/
- status = nfserr_clid_inuse;
+ move_to_confirmed(unconf);
+ nfsd4_probe_callback(unconf);
}
out:
nfs4_unlock_state();
@@ -2454,8 +2413,8 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
stp->st_file = fp;
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
- __set_bit(open->op_share_access, &stp->st_access_bmap);
- __set_bit(open->op_share_deny, &stp->st_deny_bmap);
+ set_access(open->op_share_access, stp);
+ set_deny(open->op_share_deny, stp);
stp->st_openstp = NULL;
}
@@ -2534,8 +2493,8 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
ret = nfserr_locked;
/* Search for conflicting share reservations */
list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
- if (test_bit(deny_type, &stp->st_deny_bmap) ||
- test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap))
+ if (test_deny(deny_type, stp) ||
+ test_deny(NFS4_SHARE_DENY_BOTH, stp))
goto out;
}
ret = nfs_ok;
@@ -2791,7 +2750,7 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
bool new_access;
__be32 status;
- new_access = !test_bit(op_share_access, &stp->st_access_bmap);
+ new_access = !test_access(op_share_access, stp);
if (new_access) {
status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
if (status)
@@ -2806,8 +2765,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
return status;
}
/* remember the open */
- __set_bit(op_share_access, &stp->st_access_bmap);
- __set_bit(open->op_share_deny, &stp->st_deny_bmap);
+ set_access(op_share_access, stp);
+ set_deny(open->op_share_deny, stp);
return nfs_ok;
}
@@ -3282,18 +3241,18 @@ STALE_STATEID(stateid_t *stateid)
}
static inline int
-access_permit_read(unsigned long access_bmap)
+access_permit_read(struct nfs4_ol_stateid *stp)
{
- return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) ||
- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap) ||
- test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap);
+ return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
+ test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
+ test_access(NFS4_SHARE_ACCESS_WRITE, stp);
}
static inline int
-access_permit_write(unsigned long access_bmap)
+access_permit_write(struct nfs4_ol_stateid *stp)
{
- return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) ||
- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap);
+ return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
+ test_access(NFS4_SHARE_ACCESS_BOTH, stp);
}
static
@@ -3304,9 +3263,9 @@ __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
/* For lock stateid's, we test the parent open, not the lock: */
if (stp->st_openstp)
stp = stp->st_openstp;
- if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
+ if ((flags & WR_STATE) && !access_permit_write(stp))
goto out;
- if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
+ if ((flags & RD_STATE) && !access_permit_read(stp))
goto out;
status = nfs_ok;
out:
@@ -3346,7 +3305,7 @@ static bool stateid_generation_after(stateid_t *a, stateid_t *b)
return (s32)a->si_generation - (s32)b->si_generation > 0;
}
-static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
+static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
{
/*
* When sessions are used the stateid generation number is ignored
@@ -3655,10 +3614,10 @@ out:
static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
{
- if (!test_bit(access, &stp->st_access_bmap))
+ if (!test_access(access, stp))
return;
nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
- __clear_bit(access, &stp->st_access_bmap);
+ clear_access(access, stp);
}
static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
@@ -3680,12 +3639,12 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
}
static void
-reset_union_bmap_deny(unsigned long deny, unsigned long *bmap)
+reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp)
{
int i;
for (i = 0; i < 4; i++) {
if ((i & deny) != i)
- __clear_bit(i, bmap);
+ clear_deny(i, stp);
}
}
@@ -3712,19 +3671,19 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
if (status)
goto out;
status = nfserr_inval;
- if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
- dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
+ if (!test_access(od->od_share_access, stp)) {
+ dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n",
stp->st_access_bmap, od->od_share_access);
goto out;
}
- if (!test_bit(od->od_share_deny, &stp->st_deny_bmap)) {
+ if (!test_deny(od->od_share_deny, stp)) {
dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n",
stp->st_deny_bmap, od->od_share_deny);
goto out;
}
nfs4_stateid_downgrade(stp, od->od_share_access);
- reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
+ reset_union_bmap_deny(od->od_share_deny, stp);
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -4014,13 +3973,13 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
struct nfs4_file *fp = lock_stp->st_file;
int oflag = nfs4_access_to_omode(access);
- if (test_bit(access, &lock_stp->st_access_bmap))
+ if (test_access(access, lock_stp))
return;
nfs4_file_get_access(fp, oflag);
- __set_bit(access, &lock_stp->st_access_bmap);
+ set_access(access, lock_stp);
}
-__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
+static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
{
struct nfs4_file *fi = ost->st_file;
struct nfs4_openowner *oo = openowner(ost->st_stateowner);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 74c00bc92b9..4949667c84e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1674,12 +1674,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
static void write32(__be32 **p, u32 n)
{
- *(*p)++ = n;
+ *(*p)++ = htonl(n);
}
static void write64(__be32 **p, u64 n)
{
- write32(p, (u32)(n >> 32));
+ write32(p, (n >> 32));
write32(p, (u32)n);
}
@@ -1744,15 +1744,16 @@ static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, _
}
/* Encode as an array of strings the string given with components
- * separated @sep.
+ * separated @sep, escaped with esc_enter and esc_exit.
*/
-static __be32 nfsd4_encode_components(char sep, char *components,
- __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_components_esc(char sep, char *components,
+ __be32 **pp, int *buflen,
+ char esc_enter, char esc_exit)
{
__be32 *p = *pp;
__be32 *countp = p;
int strlen, count=0;
- char *str, *end;
+ char *str, *end, *next;
dprintk("nfsd4_encode_components(%s)\n", components);
if ((*buflen -= 4) < 0)
@@ -1760,8 +1761,23 @@ static __be32 nfsd4_encode_components(char sep, char *components,
WRITE32(0); /* We will fill this in with @count later */
end = str = components;
while (*end) {
- for (; *end && (*end != sep); end++)
- ; /* Point to end of component */
+ bool found_esc = false;
+
+ /* try to parse as esc_start, ..., esc_end, sep */
+ if (*str == esc_enter) {
+ for (; *end && (*end != esc_exit); end++)
+ /* find esc_exit or end of string */;
+ next = end + 1;
+ if (*end && (!*next || *next == sep)) {
+ str++;
+ found_esc = true;
+ }
+ }
+
+ if (!found_esc)
+ for (; *end && (*end != sep); end++)
+ /* find sep or end of string */;
+
strlen = end - str;
if (strlen) {
if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
@@ -1780,6 +1796,15 @@ static __be32 nfsd4_encode_components(char sep, char *components,
return 0;
}
+/* Encode as an array of strings the string given with components
+ * separated @sep.
+ */
+static __be32 nfsd4_encode_components(char sep, char *components,
+ __be32 **pp, int *buflen)
+{
+ return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0);
+}
+
/*
* encode a location element of a fs_locations structure
*/
@@ -1789,7 +1814,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
__be32 status;
__be32 *p = *pp;
- status = nfsd4_encode_components(':', location->hosts, &p, buflen);
+ status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen,
+ '[', ']');
if (status)
return status;
status = nfsd4_encode_components('/', location->path, &p, buflen);
@@ -3251,7 +3277,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
}
static __be32
-nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
{
__be32 *p;
@@ -3306,7 +3332,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
}
static __be32
-nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_create_session *sess)
{
__be32 *p;
@@ -3355,14 +3381,14 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
}
static __be32
-nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_destroy_session *destroy_session)
{
return nfserr;
}
static __be32
-nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_free_stateid *free_stateid)
{
__be32 *p;
@@ -3371,13 +3397,13 @@ nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
return nfserr;
RESERVE_SPACE(4);
- WRITE32(nfserr);
+ *p++ = nfserr;
ADJUST_ARGS();
return nfserr;
}
static __be32
-nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_sequence *seq)
{
__be32 *p;
@@ -3399,8 +3425,8 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
return 0;
}
-__be32
-nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
+static __be32
+nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_test_stateid *test_stateid)
{
struct nfsd4_test_stateid_id *stateid, *next;
@@ -3503,7 +3529,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
* Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
* will be at least a page and will therefore hold the xdr_buf head.
*/
-int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
{
struct xdr_buf *xb = &resp->rqstp->rq_res;
struct nfsd4_session *session = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 72699885ac4..c55298ed577 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -661,6 +661,7 @@ static ssize_t __write_ports_addfd(char *buf)
{
char *mesg = buf;
int fd, err;
+ struct net *net = &init_net;
err = get_int(&mesg, &fd);
if (err != 0 || fd < 0)
@@ -672,6 +673,8 @@ static ssize_t __write_ports_addfd(char *buf)
err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
if (err < 0) {
+ if (nfsd_serv->sv_nrthreads == 1)
+ svc_shutdown_net(nfsd_serv, net);
svc_destroy(nfsd_serv);
return err;
}
@@ -709,6 +712,7 @@ static ssize_t __write_ports_addxprt(char *buf)
char transport[16];
struct svc_xprt *xprt;
int port, err;
+ struct net *net = &init_net;
if (sscanf(buf, "%15s %4u", transport, &port) != 2)
return -EINVAL;
@@ -720,12 +724,12 @@ static ssize_t __write_ports_addxprt(char *buf)
if (err != 0)
return err;
- err = svc_create_xprt(nfsd_serv, transport, &init_net,
+ err = svc_create_xprt(nfsd_serv, transport, net,
PF_INET, port, SVC_SOCK_ANONYMOUS);
if (err < 0)
goto out_err;
- err = svc_create_xprt(nfsd_serv, transport, &init_net,
+ err = svc_create_xprt(nfsd_serv, transport, net,
PF_INET6, port, SVC_SOCK_ANONYMOUS);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
@@ -734,12 +738,14 @@ static ssize_t __write_ports_addxprt(char *buf)
nfsd_serv->sv_nrthreads--;
return 0;
out_close:
- xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port);
+ xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port);
if (xprt != NULL) {
svc_close_xprt(xprt);
svc_xprt_put(xprt);
}
out_err:
+ if (nfsd_serv->sv_nrthreads == 1)
+ svc_shutdown_net(nfsd_serv, net);
svc_destroy(nfsd_serv);
return err;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cb4d51d8cbd..ee709fc8f58 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/fs_struct.h>
#include <linux/swap.h>
+#include <linux/nsproxy.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
@@ -330,6 +331,8 @@ static int nfsd_get_default_max_blksize(void)
int nfsd_create_serv(void)
{
+ int error;
+
WARN_ON(!mutex_is_locked(&nfsd_mutex));
if (nfsd_serv) {
svc_get(nfsd_serv);
@@ -343,6 +346,12 @@ int nfsd_create_serv(void)
if (nfsd_serv == NULL)
return -ENOMEM;
+ error = svc_bind(nfsd_serv, current->nsproxy->net_ns);
+ if (error < 0) {
+ svc_destroy(nfsd_serv);
+ return error;
+ }
+
set_max_drc();
do_gettimeofday(&nfssvc_boot); /* record boot time */
return 0;
@@ -373,6 +382,7 @@ int nfsd_set_nrthreads(int n, int *nthreads)
int i = 0;
int tot = 0;
int err = 0;
+ struct net *net = &init_net;
WARN_ON(!mutex_is_locked(&nfsd_mutex));
@@ -417,6 +427,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
if (err)
break;
}
+
+ if (nfsd_serv->sv_nrthreads == 1)
+ svc_shutdown_net(nfsd_serv, net);
svc_destroy(nfsd_serv);
return err;
@@ -432,6 +445,7 @@ nfsd_svc(unsigned short port, int nrservs)
{
int error;
bool nfsd_up_before;
+ struct net *net = &init_net;
mutex_lock(&nfsd_mutex);
dprintk("nfsd: creating service\n");
@@ -464,6 +478,8 @@ out_shutdown:
if (error < 0 && !nfsd_up_before)
nfsd_shutdown();
out_destroy:
+ if (nfsd_serv->sv_nrthreads == 1)
+ svc_shutdown_net(nfsd_serv, net);
svc_destroy(nfsd_serv); /* Release server */
out:
mutex_unlock(&nfsd_mutex);
@@ -547,6 +563,9 @@ nfsd(void *vrqstp)
nfsdstats.th_cnt --;
out:
+ if (rqstp->rq_server->sv_nrthreads == 1)
+ svc_shutdown_net(rqstp->rq_server, &init_net);
+
/* Release the thread */
svc_exit_thread(rqstp);
@@ -659,8 +678,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
int nfsd_pool_stats_release(struct inode *inode, struct file *file)
{
int ret = seq_release(inode, file);
+ struct net *net = &init_net;
+
mutex_lock(&nfsd_mutex);
/* this function really, really should have been called svc_put() */
+ if (nfsd_serv->sv_nrthreads == 1)
+ svc_shutdown_net(nfsd_serv, net);
svc_destroy(nfsd_serv);
mutex_unlock(&nfsd_mutex);
return ret;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 89ab137d379..849091e16ea 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -232,7 +232,6 @@ struct nfs4_client {
time_t cl_time; /* time of last lease renewal */
struct sockaddr_storage cl_addr; /* client ipaddress */
u32 cl_flavor; /* setclientid pseudoflavor */
- char *cl_principal; /* setclientid principal name */
struct svc_cred cl_cred; /* setclientid principal */
clientid_t cl_clientid; /* generated by server */
nfs4_verifier cl_confirm; /* generated by server */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 1b3501598ab..acd127d4ee8 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -60,7 +60,7 @@ struct nfsd4_compound_state {
__be32 *datap;
size_t iovlen;
u32 minorversion;
- u32 status;
+ __be32 status;
stateid_t current_stateid;
stateid_t save_stateid;
/* to indicate current and saved state id presents */
@@ -364,7 +364,7 @@ struct nfsd4_test_stateid_id {
};
struct nfsd4_test_stateid {
- __be32 ts_num_ids;
+ u32 ts_num_ids;
struct list_head ts_stateid_list;
};
@@ -549,7 +549,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
struct nfsd4_compoundargs *);
int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
struct nfsd4_compoundres *);
-int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 0bb2c2010b9..b72847988b7 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -508,31 +508,29 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
}
-static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
- int connectable)
+static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+ struct inode *parent)
{
struct nilfs_fid *fid = (struct nilfs_fid *)fh;
- struct inode *inode = dentry->d_inode;
struct nilfs_root *root = NILFS_I(inode)->i_root;
int type;
- if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
- (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+ if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {
+ *lenp = NILFS_FID_SIZE_CONNECTABLE;
+ return 255;
+ }
+ if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {
+ *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
return 255;
+ }
fid->cno = root->cno;
fid->ino = inode->i_ino;
fid->gen = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
-
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent->d_inode;
+ if (parent) {
fid->parent_ino = parent->i_ino;
fid->parent_gen = parent->i_generation;
- spin_unlock(&dentry->d_lock);
-
type = FILEID_NILFS_WITH_PARENT;
*lenp = NILFS_FID_SIZE_CONNECTABLE;
} else {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ccb14d3fc0d..b39c5c161ad 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -123,7 +123,7 @@ int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);
-static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
+static int send_to_group(struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
__u32 mask, void *data,
@@ -168,10 +168,10 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
vfsmount_test_mask &= ~inode_mark->ignored_mask;
}
- pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+ pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
" inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
" data=%p data_is=%d cookie=%d event=%p\n",
- __func__, group, to_tell, mnt, mask, inode_mark,
+ __func__, group, to_tell, mask, inode_mark,
inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
data_is, cookie, *event);
@@ -258,16 +258,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
if (inode_group > vfsmount_group) {
/* handle inode */
- ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
+ ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
data_is, cookie, file_name, &event);
/* we didn't use the vfsmount_mark */
vfsmount_group = NULL;
} else if (vfsmount_group > inode_group) {
- ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data,
+ ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
data_is, cookie, file_name, &event);
inode_group = NULL;
} else {
- ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark,
+ ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
mask, data, data_is, cookie, file_name,
&event);
}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8639169221c..7389d2d5e51 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2096,7 +2096,9 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
err = file_remove_suid(file);
if (err)
goto out;
- file_update_time(file);
+ err = file_update_time(file);
+ if (err)
+ goto out;
written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
count);
out:
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index c7ee03c2222..0725e605465 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -422,45 +422,46 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
struct ocfs2_blockcheck_stats *stats)
{
int rc = 0;
- struct ocfs2_block_check check;
+ u32 bc_crc32e;
+ u16 bc_ecc;
u32 crc, ecc;
ocfs2_blockcheck_inc_check(stats);
- check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
- check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
crc = crc32_le(~0, data, blocksize);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e)
goto out;
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
"CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
ecc = ocfs2_hamming_encode_block(data, blocksize);
- ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+ ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
/* And check the crc32 again */
crc = crc32_le(~0, data, blocksize);
- if (crc == check.bc_crc32e) {
+ if (crc == bc_crc32e) {
ocfs2_blockcheck_inc_recover(stats);
goto out;
}
mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
- bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
- bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
@@ -528,7 +529,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_blockcheck_stats *stats)
{
int i, rc = 0;
- struct ocfs2_block_check check;
+ u32 bc_crc32e;
+ u16 bc_ecc;
u32 crc, ecc, fix;
BUG_ON(nr < 0);
@@ -538,21 +540,21 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
ocfs2_blockcheck_inc_check(stats);
- check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
- check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e)
goto out;
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
"CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
for (i = 0, ecc = 0; i < nr; i++) {
@@ -565,7 +567,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
bhs[i]->b_size * 8,
bhs[i]->b_size * 8 * i);
}
- fix = ecc ^ check.bc_ecc;
+ fix = ecc ^ bc_ecc;
for (i = 0; i < nr; i++) {
/*
* Try the fix against each buffer. It will only affect
@@ -578,19 +580,19 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
/* And check the crc32 again */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
- if (crc == check.bc_crc32e) {
+ if (crc == bc_crc32e) {
ocfs2_blockcheck_inc_recover(stats);
goto out;
}
mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
- bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
- bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 3a3ed4bb794..fbec0be6232 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -293,7 +293,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
char *name;
struct list_head *iter, *head=NULL;
- u64 cookie;
+ __be64 cookie;
u32 flags;
u8 node;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index a5952ceecba..de854cca12a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -679,7 +679,7 @@ struct dlm_query_join_packet {
};
union dlm_query_join_response {
- u32 intval;
+ __be32 intval;
struct dlm_query_join_packet packet;
};
@@ -755,8 +755,8 @@ struct dlm_query_region {
struct dlm_node_info {
u8 ni_nodenum;
u8 pad1;
- u16 ni_ipv4_port;
- u32 ni_ipv4_address;
+ __be16 ni_ipv4_port;
+ __be32 ni_ipv4_address;
};
struct dlm_query_nodeinfo {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 92f2ead0fab..9e89d70df33 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -818,7 +818,7 @@ static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
union dlm_query_join_response response;
response.packet = *packet;
- *wire = cpu_to_be32(response.intval);
+ *wire = be32_to_cpu(response.intval);
}
static void dlm_query_join_wire_to_packet(u32 wire,
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 745db42528d..322216a5f0d 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -177,21 +177,23 @@ bail:
return parent;
}
-static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
- int connectable)
+static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type = 1;
u64 blkno;
u32 generation;
__le32 *fh = (__force __le32 *) fh_in;
+#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
+#error "You go ahead and fix that mess, then. Somehow"
trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
dentry->d_name.name,
fh, len, connectable);
+#endif
- if (connectable && (len < 6)) {
+ if (parent && (len < 6)) {
*max_len = 6;
type = 255;
goto bail;
@@ -211,12 +213,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[2] = cpu_to_le32(generation);
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
-
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
+ if (parent) {
blkno = OCFS2_I(parent)->ip_blkno;
generation = parent->i_generation;
@@ -224,8 +221,6 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[5] = cpu_to_le32(generation);
- spin_unlock(&dentry->d_lock);
-
len = 6;
type = 2;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 735514ca400..d89e08a81ed 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -273,11 +273,13 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_gid = le32_to_cpu(fe->i_gid);
/* Fast symlinks will have i_size but no allocated clusters. */
- if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
inode->i_blocks = 0;
- else
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+ } else {
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mapping->a_ops = &ocfs2_aops;
+ inode->i_mapping->a_ops = &ocfs2_aops;
+ }
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -331,10 +333,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_I(inode)->ip_dir_lock_gen = 1;
break;
case S_IFLNK:
- if (ocfs2_inode_is_fast_symlink(inode))
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
- else
- inode->i_op = &ocfs2_symlink_inode_operations;
+ inode->i_op = &ocfs2_symlink_inode_operations;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index a1a1bfd652c..d96f7f81d8d 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -864,7 +864,7 @@ int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
if (status)
break;
- reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+ reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
if (!reqp) {
status = -EINVAL;
goto bail;
@@ -888,9 +888,11 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct ocfs2_space_resv sr;
struct ocfs2_new_group_input input;
struct reflink_arguments args;
- const char *old_path, *new_path;
+ const char __user *old_path;
+ const char __user *new_path;
bool preserve;
struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -937,17 +939,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return ocfs2_group_add(inode, &input);
case OCFS2_IOC_REFLINK:
- if (copy_from_user(&args, (struct reflink_arguments *)arg,
- sizeof(args)))
+ if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
- old_path = (const char *)(unsigned long)args.old_path;
- new_path = (const char *)(unsigned long)args.new_path;
+ old_path = (const char __user *)(unsigned long)args.old_path;
+ new_path = (const char __user *)(unsigned long)args.new_path;
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
case OCFS2_IOC_INFO:
- if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
- sizeof(struct ocfs2_info)))
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
return -EFAULT;
return ocfs2_info_handle(inode, &info, 0);
@@ -960,22 +960,20 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&range, (struct fstrim_range *)arg,
- sizeof(range)))
+ if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
return ret;
- if (copy_to_user((struct fstrim_range *)arg, &range,
- sizeof(range)))
+ if (copy_to_user(argp, &range, sizeof(range)))
return -EFAULT;
return 0;
}
case OCFS2_IOC_MOVE_EXT:
- return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
+ return ocfs2_ioctl_move_extents(filp, argp);
default:
return -ENOTTY;
}
@@ -988,6 +986,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
struct reflink_arguments args;
struct inode *inode = file->f_path.dentry->d_inode;
struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
switch (cmd) {
case OCFS2_IOC32_GETFLAGS:
@@ -1006,16 +1005,14 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case FITRIM:
break;
case OCFS2_IOC_REFLINK:
- if (copy_from_user(&args, (struct reflink_arguments *)arg,
- sizeof(args)))
+ if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
compat_ptr(args.new_path), preserve);
case OCFS2_IOC_INFO:
- if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
- sizeof(struct ocfs2_info)))
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
return -EFAULT;
return ocfs2_info_handle(inode, &info, 1);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index b1e3fce72ea..6083432f667 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1082,8 +1082,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
context->file = filp;
if (argp) {
- if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
- sizeof(range))) {
+ if (copy_from_user(&range, argp, sizeof(range))) {
status = -EFAULT;
goto out;
}
@@ -1138,8 +1137,7 @@ out:
* length and new_offset even if failure happens somewhere.
*/
if (argp) {
- if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
- sizeof(range)))
+ if (copy_to_user(argp, &range, sizeof(range)))
status = -EFAULT;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a9856e3eaaf..9f39c640cdd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1724,15 +1724,16 @@ static int ocfs2_symlink(struct inode *dir,
fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
inode->i_rdev = 0;
newsize = l - 1;
+ inode->i_op = &ocfs2_symlink_inode_operations;
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
- inode->i_op = &ocfs2_symlink_inode_operations;
status = dquot_alloc_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status)
goto bail;
did_quota = 1;
+ inode->i_mapping->a_ops = &ocfs2_aops;
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
@@ -1750,7 +1751,7 @@ static int ocfs2_symlink(struct inode *dir,
i_size_write(inode, newsize);
inode->i_blocks = ocfs2_inode_sector_count(inode);
} else {
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
memcpy((char *) fe->id2.i_symlink, symname, l);
i_size_write(inode, newsize);
inode->i_blocks = 0;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 5d22872e2bb..f1fbb4b552a 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -54,101 +54,40 @@
#include "buffer_head_io.h"
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
- struct buffer_head **bh)
+static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
{
- int status;
- char *link = NULL;
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *bh;
+ int status = ocfs2_read_inode_block(inode, &bh);
struct ocfs2_dinode *fe;
+ const char *link;
+ void *kaddr;
+ size_t len;
- status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
- link = ERR_PTR(status);
- goto bail;
+ return status;
}
- fe = (struct ocfs2_dinode *) (*bh)->b_data;
+ fe = (struct ocfs2_dinode *) bh->b_data;
link = (char *) fe->id2.i_symlink;
-bail:
-
- return link;
-}
-
-static int ocfs2_readlink(struct dentry *dentry,
- char __user *buffer,
- int buflen)
-{
- int ret;
- char *link;
- struct buffer_head *bh = NULL;
- struct inode *inode = dentry->d_inode;
-
- link = ocfs2_fast_symlink_getlink(inode, &bh);
- if (IS_ERR(link)) {
- ret = PTR_ERR(link);
- goto out;
- }
-
- /*
- * Without vfsmount we can't update atime now,
- * but we will update atime here ultimately.
- */
- ret = vfs_readlink(dentry, buffer, buflen, link);
-
+ /* will be less than a page size */
+ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr, link, len + 1);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ unlock_page(page);
brelse(bh);
-out:
- if (ret < 0)
- mlog_errno(ret);
- return ret;
+ return 0;
}
-static void *ocfs2_fast_follow_link(struct dentry *dentry,
- struct nameidata *nd)
-{
- int status = 0;
- int len;
- char *target, *link = ERR_PTR(-ENOMEM);
- struct inode *inode = dentry->d_inode;
- struct buffer_head *bh = NULL;
-
- BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
- target = ocfs2_fast_symlink_getlink(inode, &bh);
- if (IS_ERR(target)) {
- status = PTR_ERR(target);
- mlog_errno(status);
- goto bail;
- }
-
- /* Fast symlinks can't be large */
- len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
- link = kzalloc(len + 1, GFP_NOFS);
- if (!link) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- memcpy(link, target, len);
-
-bail:
- nd_set_link(nd, status ? ERR_PTR(status) : link);
- brelse(bh);
-
- if (status)
- mlog_errno(status);
- return NULL;
-}
-
-static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
- char *link = nd_get_link(nd);
- if (!IS_ERR(link))
- kfree(link);
-}
+const struct address_space_operations ocfs2_fast_symlink_aops = {
+ .readpage = ocfs2_fast_symlink_readpage,
+};
const struct inode_operations ocfs2_symlink_inode_operations = {
- .readlink = page_readlink,
+ .readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.getattr = ocfs2_getattr,
@@ -159,15 +98,3 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
};
-const struct inode_operations ocfs2_fast_symlink_inode_operations = {
- .readlink = ocfs2_readlink,
- .follow_link = ocfs2_fast_follow_link,
- .put_link = ocfs2_fast_put_link,
- .getattr = ocfs2_getattr,
- .setattr = ocfs2_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
- .fiemap = ocfs2_fiemap,
-};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 65a6c9c6ad5..71ee4245e91 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -27,7 +27,7 @@
#define OCFS2_SYMLINK_H
extern const struct inode_operations ocfs2_symlink_inode_operations;
-extern const struct inode_operations ocfs2_fast_symlink_inode_operations;
+extern const struct address_space_operations ocfs2_fast_symlink_aops;
/*
* Test whether an inode is a fast symlink.
diff --git a/fs/open.c b/fs/open.c
index d54301219d0..d6c79a0dffc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -654,10 +654,23 @@ static inline int __get_file_write_access(struct inode *inode,
return error;
}
-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
- struct file *f,
- int (*open)(struct inode *, struct file *),
- const struct cred *cred)
+int open_check_o_direct(struct file *f)
+{
+ /* NB: we're sure to have correct a_ops only after f_op->open */
+ if (f->f_flags & O_DIRECT) {
+ if (!f->f_mapping->a_ops ||
+ ((!f->f_mapping->a_ops->direct_IO) &&
+ (!f->f_mapping->a_ops->get_xip_mem))) {
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt,
+ struct file *f,
+ int (*open)(struct inode *, struct file *),
+ const struct cred *cred)
{
static const struct file_operations empty_fops = {};
struct inode *inode;
@@ -713,16 +726,6 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
- /* NB: we're sure to have correct a_ops only after f_op->open */
- if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops ||
- ((!f->f_mapping->a_ops->direct_IO) &&
- (!f->f_mapping->a_ops->get_xip_mem))) {
- fput(f);
- f = ERR_PTR(-EINVAL);
- }
- }
-
return f;
cleanup_all:
@@ -744,12 +747,29 @@ cleanup_all:
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
cleanup_file:
- put_filp(f);
dput(dentry);
mntput(mnt);
return ERR_PTR(error);
}
+static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
+ struct file *f,
+ int (*open)(struct inode *, struct file *),
+ const struct cred *cred)
+{
+ struct file *res = do_dentry_open(dentry, mnt, f, open, cred);
+ if (!IS_ERR(res)) {
+ int error = open_check_o_direct(f);
+ if (error) {
+ fput(res);
+ res = ERR_PTR(error);
+ }
+ } else {
+ put_filp(f);
+ }
+ return res;
+}
+
/**
* lookup_instantiate_filp - instantiates the open intent filp
* @nd: pointer to nameidata
@@ -804,13 +824,31 @@ struct file *nameidata_to_filp(struct nameidata *nd)
/* Pick up the filp from the open intent */
filp = nd->intent.open.file;
- nd->intent.open.file = NULL;
/* Has the filesystem initialised the file for us? */
- if (filp->f_path.dentry == NULL) {
+ if (filp->f_path.dentry != NULL) {
+ nd->intent.open.file = NULL;
+ } else {
+ struct file *res;
+
path_get(&nd->path);
- filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
- NULL, cred);
+ res = do_dentry_open(nd->path.dentry, nd->path.mnt,
+ filp, NULL, cred);
+ if (!IS_ERR(res)) {
+ int error;
+
+ nd->intent.open.file = NULL;
+ BUG_ON(res != filp);
+
+ error = open_check_o_direct(filp);
+ if (error) {
+ fput(filp);
+ filp = ERR_PTR(error);
+ }
+ } else {
+ /* Allow nd->intent.open.file to be recycled */
+ filp = res;
+ }
}
return filp;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 95ebb56de49..49c1065256f 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -654,8 +654,11 @@ out:
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
- if (ret > 0)
- file_update_time(filp);
+ if (ret > 0) {
+ int err = file_update_time(filp);
+ if (err)
+ ret = err;
+ }
return ret;
}
diff --git a/fs/pnode.c b/fs/pnode.c
index ab5fa9e1a79..bed378db075 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
prev_src_mnt = child;
}
out:
- br_write_lock(vfsmount_lock);
+ br_write_lock(&vfsmount_lock);
while (!list_empty(&tmp_list)) {
child = list_first_entry(&tmp_list, struct mount, mnt_hash);
umount_tree(child, 0, &umount_list);
}
- br_write_unlock(vfsmount_lock);
+ br_write_unlock(&vfsmount_lock);
release_mounts(&umount_list);
return ret;
}
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 12412852d88..5e289a7cbad 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
poll_wait(file, &p->ns->poll, wait);
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
if (p->m.poll_event != ns->event) {
p->m.poll_event = ns->event;
res |= POLLERR | POLLPRI;
}
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return res;
}
diff --git a/fs/readdir.c b/fs/readdir.c
index cc0a8227cdd..39e3370d79c 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -108,11 +108,11 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
int error;
struct file * file;
struct readdir_callback buf;
+ int fput_needed;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.result = 0;
buf.dirent = dirent;
@@ -121,8 +121,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
if (buf.result)
error = buf.result;
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
@@ -195,16 +194,15 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct file * file;
struct linux_dirent __user * lastdirent;
struct getdents_callback buf;
+ int fput_needed;
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
+ return -EFAULT;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.current_dir = dirent;
buf.previous = NULL;
@@ -221,8 +219,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
@@ -278,16 +275,15 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
struct file * file;
struct linux_dirent64 __user * lastdirent;
struct getdents_callback64 buf;
+ int fput_needed;
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
+ return -EFAULT;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.current_dir = dirent;
buf.previous = NULL;
@@ -305,7 +301,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 59d06871a85..a6d4268fb6c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1592,13 +1592,12 @@ struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
(fh_type == 6) ? fid->raw[5] : 0);
}
-int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
- int need_parent)
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
int maxlen = *lenp;
- if (need_parent && (maxlen < 5)) {
+ if (parent && (maxlen < 5)) {
*lenp = 5;
return 255;
} else if (maxlen < 3) {
@@ -1610,20 +1609,15 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
data[2] = inode->i_generation;
*lenp = 3;
- /* no room for directory info? return what we've stored so far */
- if (maxlen < 5 || !need_parent)
- return 3;
-
- spin_lock(&dentry->d_lock);
- inode = dentry->d_parent->d_inode;
- data[3] = inode->i_ino;
- data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
- *lenp = 5;
- if (maxlen >= 6) {
- data[5] = inode->i_generation;
- *lenp = 6;
- }
- spin_unlock(&dentry->d_lock);
+ if (parent) {
+ data[3] = parent->i_ino;
+ data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
+ *lenp = 5;
+ if (maxlen >= 6) {
+ data[5] = parent->i_generation;
+ *lenp = 6;
+ }
+ }
return *lenp;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b1a08573fe1..afcadcc03e8 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1923,6 +1923,8 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
* the workqueue job (flush_async_commit) needs this lock
*/
reiserfs_write_unlock(sb);
+
+ cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
flush_workqueue(commit_wq);
if (!reiserfs_mounted_fs_count) {
@@ -3231,8 +3233,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
th->t_trans_id, journal->j_trans_id);
}
- sb->s_dirt = 1;
-
prepared = test_clear_buffer_journal_prepared(bh);
clear_buffer_journal_restore_dirty(bh);
/* already in this transaction, we are done */
@@ -3316,6 +3316,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
journal->j_first = cn;
journal->j_last = cn;
}
+ reiserfs_schedule_old_flush(sb);
return 0;
}
@@ -3492,7 +3493,7 @@ static void flush_async_commits(struct work_struct *work)
** flushes any old transactions to disk
** ends the current transaction if it is too old
*/
-int reiserfs_flush_old_commits(struct super_block *sb)
+void reiserfs_flush_old_commits(struct super_block *sb)
{
time_t now;
struct reiserfs_transaction_handle th;
@@ -3502,9 +3503,8 @@ int reiserfs_flush_old_commits(struct super_block *sb)
/* safety check so we don't flush while we are replaying the log during
* mount
*/
- if (list_empty(&journal->j_journal_list)) {
- return 0;
- }
+ if (list_empty(&journal->j_journal_list))
+ return;
/* check the current transaction. If there are no writers, and it is
* too old, finish it, and force the commit blocks to disk
@@ -3526,7 +3526,6 @@ int reiserfs_flush_old_commits(struct super_block *sb)
do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
}
}
- return sb->s_dirt;
}
/*
@@ -3955,7 +3954,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
** it tells us if we should continue with the journal_end, or just return
*/
if (!check_journal_end(th, sb, nblocks, flags)) {
- sb->s_dirt = 1;
+ reiserfs_schedule_old_flush(sb);
wake_queued_writers(sb);
reiserfs_async_progress_wait(sb);
goto out;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index a59d2712633..33215f57ea0 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -480,6 +480,11 @@ struct reiserfs_sb_info {
struct dentry *priv_root; /* root of /.reiserfs_priv */
struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
int j_errno;
+
+ int work_queued; /* non-zero delayed work is queued */
+ struct delayed_work old_work; /* old transactions flush delayed work */
+ spinlock_t old_work_lock; /* protects old_work and work_queued */
+
#ifdef CONFIG_QUOTA
char *s_qf_names[MAXQUOTAS];
int s_jquota_fmt;
@@ -2452,7 +2457,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
int reiserfs_commit_page(struct inode *inode, struct page *page,
unsigned from, unsigned to);
-int reiserfs_flush_old_commits(struct super_block *);
+void reiserfs_flush_old_commits(struct super_block *);
int reiserfs_commit_for_inode(struct inode *);
int reiserfs_inode_needs_commit(struct inode *);
void reiserfs_update_inode_transaction(struct inode *);
@@ -2487,6 +2492,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
int reiserfs_allocate_list_bitmaps(struct super_block *s,
struct reiserfs_list_bitmap *, unsigned int);
+void reiserfs_schedule_old_flush(struct super_block *s);
void add_save_link(struct reiserfs_transaction_handle *th,
struct inode *inode, int truncate);
int remove_save_link(struct inode *inode, int truncate);
@@ -2611,8 +2617,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type);
struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type);
-int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
- int connectable);
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+ struct inode *parent);
int reiserfs_truncate_file(struct inode *, int update_timestamps);
void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 9a17f63c3fd..3ce02cff5e9 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -200,7 +200,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
(bmap_nr_new - bmap_nr)));
PUT_SB_BLOCK_COUNT(s, block_count_new);
PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
- s->s_dirt = 1;
journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c07b7d70944..651ce767b55 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -72,20 +72,58 @@ static int reiserfs_sync_fs(struct super_block *s, int wait)
if (!journal_begin(&th, s, 1))
if (!journal_end_sync(&th, s, 1))
reiserfs_flush_old_commits(s);
- s->s_dirt = 0; /* Even if it's not true.
- * We'll loop forever in sync_supers otherwise */
reiserfs_write_unlock(s);
return 0;
}
-static void reiserfs_write_super(struct super_block *s)
+static void flush_old_commits(struct work_struct *work)
{
+ struct reiserfs_sb_info *sbi;
+ struct super_block *s;
+
+ sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
+ s = sbi->s_journal->j_work_sb;
+
+ spin_lock(&sbi->old_work_lock);
+ sbi->work_queued = 0;
+ spin_unlock(&sbi->old_work_lock);
+
reiserfs_sync_fs(s, 1);
}
+void reiserfs_schedule_old_flush(struct super_block *s)
+{
+ struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+ unsigned long delay;
+
+ if (s->s_flags & MS_RDONLY)
+ return;
+
+ spin_lock(&sbi->old_work_lock);
+ if (!sbi->work_queued) {
+ delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+ queue_delayed_work(system_long_wq, &sbi->old_work, delay);
+ sbi->work_queued = 1;
+ }
+ spin_unlock(&sbi->old_work_lock);
+}
+
+static void cancel_old_flush(struct super_block *s)
+{
+ struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
+ cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+ spin_lock(&sbi->old_work_lock);
+ sbi->work_queued = 0;
+ spin_unlock(&sbi->old_work_lock);
+}
+
static int reiserfs_freeze(struct super_block *s)
{
struct reiserfs_transaction_handle th;
+
+ cancel_old_flush(s);
+
reiserfs_write_lock(s);
if (!(s->s_flags & MS_RDONLY)) {
int err = journal_begin(&th, s, 1);
@@ -99,7 +137,6 @@ static int reiserfs_freeze(struct super_block *s)
journal_end_sync(&th, s, 1);
}
}
- s->s_dirt = 0;
reiserfs_write_unlock(s);
return 0;
}
@@ -483,9 +520,6 @@ static void reiserfs_put_super(struct super_block *s)
reiserfs_write_lock(s);
- if (s->s_dirt)
- reiserfs_write_super(s);
-
/* change file system state to current state if it was mounted with read-write permissions */
if (!(s->s_flags & MS_RDONLY)) {
if (!journal_begin(&th, s, 10)) {
@@ -692,7 +726,6 @@ static const struct super_operations reiserfs_sops = {
.dirty_inode = reiserfs_dirty_inode,
.evict_inode = reiserfs_evict_inode,
.put_super = reiserfs_put_super,
- .write_super = reiserfs_write_super,
.sync_fs = reiserfs_sync_fs,
.freeze_fs = reiserfs_freeze,
.unfreeze_fs = reiserfs_unfreeze,
@@ -1400,7 +1433,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
err = journal_end(&th, s, 10);
if (err)
goto out_err;
- s->s_dirt = 0;
if (!(*mount_flags & MS_RDONLY)) {
dquot_resume(s, -1);
@@ -1730,19 +1762,21 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
return -ENOMEM;
s->s_fs_info = sbi;
/* Set default values for options: non-aggressive tails, RO on errors */
- REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
- REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
- REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
+ sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
+ sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
+ sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
/* no preallocation minimum, be smart in
reiserfs_file_write instead */
- REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
+ sbi->s_alloc_options.preallocmin = 0;
/* Preallocate by 16 blocks (17-1) at once */
- REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
+ sbi->s_alloc_options.preallocsize = 17;
/* setup default block allocator options */
reiserfs_init_alloc_options(s);
- mutex_init(&REISERFS_SB(s)->lock);
- REISERFS_SB(s)->lock_depth = -1;
+ spin_lock_init(&sbi->old_work_lock);
+ INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
+ mutex_init(&sbi->lock);
+ sbi->lock_depth = -1;
jdev_name = NULL;
if (reiserfs_parse_options
@@ -1751,8 +1785,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
goto error_unlocked;
}
if (jdev_name && jdev_name[0]) {
- REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
- if (!REISERFS_SB(s)->s_jdev) {
+ sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
+ if (!sbi->s_jdev) {
SWARN(silent, s, "", "Cannot allocate memory for "
"journal device name");
goto error;
@@ -1810,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
/* make data=ordered the default */
if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
!reiserfs_data_writeback(s)) {
- REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+ sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
}
if (reiserfs_data_log(s)) {
@@ -2003,6 +2037,8 @@ error_unlocked:
reiserfs_write_unlock(s);
}
+ cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+
reiserfs_free_bitmap_cache(s);
if (SB_BUFFER_WITH_SB(s))
brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 7ae2a574cb2..9f35a37173d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -269,12 +269,13 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
if (ufd < 0)
kfree(ctx);
} else {
- struct file *file = fget(ufd);
+ int fput_needed;
+ struct file *file = fget_light(ufd, &fput_needed);
if (!file)
return -EBADF;
ctx = file->private_data;
if (file->f_op != &signalfd_fops) {
- fput(file);
+ fput_light(file, fput_needed);
return -EINVAL;
}
spin_lock_irq(&current->sighand->siglock);
@@ -282,7 +283,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
spin_unlock_irq(&current->sighand->siglock);
wake_up(&current->sighand->signalfd_wqh);
- fput(file);
+ fput_light(file, fput_needed);
}
return ufd;
diff --git a/fs/splice.c b/fs/splice.c
index 406ef2b792c..c9f1318a3b8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1003,8 +1003,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
ret = file_remove_suid(out);
if (!ret) {
- file_update_time(out);
- ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+ ret = file_update_time(out);
+ if (!ret)
+ ret = splice_from_pipe_feed(pipe, &sd,
+ pipe_to_file);
}
mutex_unlock(&inode->i_mutex);
} while (ret > 0);
diff --git a/fs/statfs.c b/fs/statfs.c
index 43e6b6fe4e8..95ad5c0e586 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -87,11 +87,12 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)
int fd_statfs(int fd, struct kstatfs *st)
{
- struct file *file = fget(fd);
+ int fput_needed;
+ struct file *file = fget_light(fd, &fput_needed);
int error = -EBADF;
if (file) {
error = vfs_statfs(&file->f_path, st);
- fput(file);
+ fput_light(file, fput_needed);
}
return error;
}
diff --git a/fs/sync.c b/fs/sync.c
index 0e8db939d96..11e3d1c4490 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -188,11 +188,12 @@ static int do_fsync(unsigned int fd, int datasync)
{
struct file *file;
int ret = -EBADF;
+ int fput_needed;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (file) {
ret = vfs_fsync(file, datasync);
- fput(file);
+ fput_light(file, fput_needed);
}
return ret;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 62a2727f4ec..a6d42efc76d 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1127,16 +1127,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- stat->dev = inode->i_sb->s_dev;
- stat->ino = inode->i_ino;
- stat->mode = inode->i_mode;
- stat->nlink = inode->i_nlink;
- stat->uid = inode->i_uid;
- stat->gid = inode->i_gid;
- stat->rdev = inode->i_rdev;
- stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
- stat->ctime = inode->i_ctime;
+ generic_fillattr(inode, stat);
stat->blksize = UBIFS_BLOCK_SIZE;
stat->size = ui->ui_size;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index a165c66e3ee..18024178ac4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1260,16 +1260,15 @@ static struct dentry *udf_fh_to_parent(struct super_block *sb,
fid->udf.parent_partref,
fid->udf.parent_generation);
}
-static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
- int connectable)
+static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+ struct inode *parent)
{
int len = *lenp;
- struct inode *inode = de->d_inode;
struct kernel_lb_addr location = UDF_I(inode)->i_location;
struct fid *fid = (struct fid *)fh;
int type = FILEID_UDF_WITHOUT_PARENT;
- if (connectable && (len < 5)) {
+ if (parent && (len < 5)) {
*lenp = 5;
return 255;
} else if (len < 3) {
@@ -1282,14 +1281,11 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
fid->udf.partref = location.partitionReferenceNum;
fid->udf.generation = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- spin_lock(&de->d_lock);
- inode = de->d_parent->d_inode;
- location = UDF_I(inode)->i_location;
+ if (parent) {
+ location = UDF_I(parent)->i_location;
fid->udf.parent_block = location.logicalBlockNum;
fid->udf.parent_partref = location.partitionReferenceNum;
fid->udf.parent_generation = inode->i_generation;
- spin_unlock(&de->d_lock);
*lenp = 5;
type = FILEID_UDF_WITH_PARENT;
}
diff --git a/fs/utimes.c b/fs/utimes.c
index ba653f3dc1b..fa4dbe451e2 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -140,18 +140,19 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
goto out;
if (filename == NULL && dfd != AT_FDCWD) {
+ int fput_needed;
struct file *file;
if (flags & AT_SYMLINK_NOFOLLOW)
goto out;
- file = fget(dfd);
+ file = fget_light(dfd, &fput_needed);
error = -EBADF;
if (!file)
goto out;
error = utimes_common(&file->f_path, times);
- fput(file);
+ fput_light(file, fput_needed);
} else {
struct path path;
int lookup_flags = 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index 3c8c1cc333c..1d7ac379045 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -399,11 +399,12 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
const void __user *,value, size_t, size, int, flags)
{
+ int fput_needed;
struct file *f;
struct dentry *dentry;
int error = -EBADF;
- f = fget(fd);
+ f = fget_light(fd, &fput_needed);
if (!f)
return error;
dentry = f->f_path.dentry;
@@ -413,7 +414,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
error = setxattr(dentry, name, value, size, flags);
mnt_drop_write_file(f);
}
- fput(f);
+ fput_light(f, fput_needed);
return error;
}
@@ -486,15 +487,16 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
void __user *, value, size_t, size)
{
+ int fput_needed;
struct file *f;
ssize_t error = -EBADF;
- f = fget(fd);
+ f = fget_light(fd, &fput_needed);
if (!f)
return error;
audit_inode(NULL, f->f_path.dentry);
error = getxattr(f->f_path.dentry, name, value, size);
- fput(f);
+ fput_light(f, fput_needed);
return error;
}
@@ -566,15 +568,16 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
+ int fput_needed;
struct file *f;
ssize_t error = -EBADF;
- f = fget(fd);
+ f = fget_light(fd, &fput_needed);
if (!f)
return error;
audit_inode(NULL, f->f_path.dentry);
error = listxattr(f->f_path.dentry, list, size);
- fput(f);
+ fput_light(f, fput_needed);
return error;
}
@@ -634,11 +637,12 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
+ int fput_needed;
struct file *f;
struct dentry *dentry;
int error = -EBADF;
- f = fget(fd);
+ f = fget_light(fd, &fput_needed);
if (!f)
return error;
dentry = f->f_path.dentry;
@@ -648,7 +652,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
error = removexattr(dentry, name);
mnt_drop_write_file(f);
}
- fput(f);
+ fput_light(f, fput_needed);
return error;
}
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a907de565db..4a7286c1dc8 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
}
void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
+kmem_alloc(size_t size, xfs_km_flags_t flags)
{
int retries = 0;
gfp_t lflags = kmem_flags_convert(flags);
@@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
}
void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
{
void *ptr;
@@ -87,7 +87,7 @@ kmem_free(const void *ptr)
void *
kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
- unsigned int __nocast flags)
+ xfs_km_flags_t flags)
{
void *new;
@@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
}
void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
{
int retries = 0;
gfp_t lflags = kmem_flags_convert(flags);
@@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
}
void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
{
void *ptr;
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index ab7c53fe346..b2f2620f9a8 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -27,10 +27,11 @@
* General memory allocation interfaces
*/
-#define KM_SLEEP 0x0001u
-#define KM_NOSLEEP 0x0002u
-#define KM_NOFS 0x0004u
-#define KM_MAYFAIL 0x0008u
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
/*
* We use a special process flag to avoid recursive callbacks into
@@ -38,7 +39,7 @@
* warnings, so we explicitly skip any generic ones (silly of us).
*/
static inline gfp_t
-kmem_flags_convert(unsigned int __nocast flags)
+kmem_flags_convert(xfs_km_flags_t flags)
{
gfp_t lflags;
@@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags)
return lflags;
}
-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc(size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
extern void kmem_free(const void *);
static inline void *kmem_zalloc_large(size_t size)
@@ -107,7 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone)
kmem_cache_destroy(zone);
}
-extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
+extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 2d25d19c4ea..42679223a0f 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -52,19 +52,18 @@ static int xfs_fileid_length(int fileid_type)
STATIC int
xfs_fs_encode_fh(
- struct dentry *dentry,
- __u32 *fh,
- int *max_len,
- int connectable)
+ struct inode *inode,
+ __u32 *fh,
+ int *max_len,
+ struct inode *parent)
{
struct fid *fid = (struct fid *)fh;
struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
- struct inode *inode = dentry->d_inode;
int fileid_type;
int len;
/* Directories don't need their parent encoded, they have ".." */
- if (S_ISDIR(inode->i_mode) || !connectable)
+ if (!parent)
fileid_type = FILEID_INO32_GEN;
else
fileid_type = FILEID_INO32_GEN_PARENT;
@@ -96,20 +95,16 @@ xfs_fs_encode_fh(
switch (fileid_type) {
case FILEID_INO32_GEN_PARENT:
- spin_lock(&dentry->d_lock);
- fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
- fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
- spin_unlock(&dentry->d_lock);
+ fid->i32.parent_ino = XFS_I(parent)->i_ino;
+ fid->i32.parent_gen = parent->i_generation;
/*FALLTHRU*/
case FILEID_INO32_GEN:
fid->i32.ino = XFS_I(inode)->i_ino;
fid->i32.gen = inode->i_generation;
break;
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
- spin_lock(&dentry->d_lock);
- fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
- fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
- spin_unlock(&dentry->d_lock);
+ fid64->parent_ino = XFS_I(parent)->i_ino;
+ fid64->parent_gen = parent->i_generation;
/*FALLTHRU*/
case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
fid64->ino = XFS_I(inode)->i_ino;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8d214b87f6b..9f7ec15a652 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -586,8 +586,11 @@ restart:
* lock above. Eventually we should look into a way to avoid
* the pointless lock roundtrip.
*/
- if (likely(!(file->f_mode & FMODE_NOCMTIME)))
- file_update_time(file);
+ if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+ error = file_update_time(file);
+ if (error)
+ return error;
+ }
/*
* If we're writing the file then make sure to clear the setuid and
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 6b965bf450e..f30d9807dc4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3152,7 +3152,7 @@ xlog_ticket_alloc(
int cnt,
char client,
bool permanent,
- int alloc_flags)
+ xfs_km_flags_t alloc_flags)
{
struct xlog_ticket *tic;
uint num_headers;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 735ff1ee53d..5bc33261f5b 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -555,7 +555,7 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
extern kmem_zone_t *xfs_log_ticket_zone;
struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
int count, char client, bool permanent,
- int alloc_flags);
+ xfs_km_flags_t alloc_flags);
static inline void
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index cdf896fcbfa..fdf324508c5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -584,7 +584,7 @@ xfs_trans_t *
_xfs_trans_alloc(
xfs_mount_t *mp,
uint type,
- uint memflags)
+ xfs_km_flags_t memflags)
{
xfs_trans_t *tp;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7ab99e1898c..7c37b533aa8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -443,7 +443,7 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces.
*/
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
+xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
uint, uint);